require(pacman)
p_load(tidyverse,dlookr,patchwork,skimr,kableExtra,naniar,lubridate,purrr,tidyr,
       crosstable,flextable,scales,survival,survminer,DataExplorer,performance,
       missForest,HDoutliers,caret,praznik,mRMRe,VarSelLCM,plotly,ca,factoextra,
       Rmixmod,kamila)

Data

# I load the data from csv file:
lung_data <- read.csv('data/complete_clinrad_data.csv', sep = ',') # called complete clin rad data

# I assign Patient ID to rownames to ease further work with the data
rownames(lung_data) <- lung_data$PatientID

#I remove PatientID column an additional index column not of interest as we will use rownames(patient id) to identify each entry
lung_data <- lung_data %>% select(.,-c(X,PatientID))

# I check dataframe dimensions:
dim(lung_data)
## [1]  421 1257
# As many clinical variables are identified as numeric when this is not the case, 
# I modify data type for specific variables of interest
# clinical T,N,M and overall stage are ordered categorical variables:
lung_data$clinical.T.Stage <- factor(lung_data$clinical.T.Stage, ordered = TRUE)
lung_data$Clinical.N.Stage <- factor(lung_data$Clinical.N.Stage, ordered = TRUE)
lung_data$Clinical.M.Stage <- factor(lung_data$Clinical.M.Stage, ordered = TRUE)
lung_data$Overall.Stage <- factor(lung_data$Overall.Stage, ordered = TRUE)

# dead status event is a binary categorical variable:
lung_data$deadstatus.event <- factor(lung_data$deadstatus.event)

# gender, histology, and manufacturer are all categorical variables also:
lung_data$gender <- factor(lung_data$gender)
lung_data$Histology <- factor(lung_data$Histology)
lung_data$Manufacturer <- factor(lung_data$Manufacturer)

# study date is a date variable, we adjust format with the help of lubridate package:
lung_data$Study.Date <- mdy(lung_data$Study.Date)

# I check resulting data structure and varnames for the modified variables:
str(lung_data[1:11])
## 'data.frame':    421 obs. of  11 variables:
##  $ age             : num  78.8 83.8 68.2 70.9 80.5 ...
##  $ clinical.T.Stage: Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 2 2 2 2 4 3 2 2 2 4 ...
##  $ Clinical.N.Stage: Ord.factor w/ 5 levels "0"<"1"<"2"<"3"<..: 4 1 4 2 3 2 3 3 3 4 ...
##  $ Clinical.M.Stage: Ord.factor w/ 3 levels "0"<"1"<"3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Overall.Stage   : Ord.factor w/ 4 levels "I"<"II"<"IIIa"<..: 4 1 4 2 4 3 3 3 3 4 ...
##  $ Histology       : Factor w/ 4 levels "adenocarcinoma",..: 2 4 2 4 4 4 4 1 4 4 ...
##  $ gender          : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Survival.time   : int  2165 155 256 141 353 173 137 77 131 2119 ...
##  $ deadstatus.event: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
##  $ Study.Date      : Date, format: "2008-09-18" "2014-01-01" ...
##  $ Manufacturer    : Factor w/ 3 levels "CMS Inc.","Plastimatch",..: 3 1 1 3 1 1 3 1 1 1 ...
# We can see the remaining numeric variables account for all the radiomic variables (1246)
dim(lung_data %>% keep(is.numeric) %>% select(.,-c(age,Survival.time)))
## [1]  421 1246

Main research question is weather there is an association between CT radiomic features and NSCLC histology subtype and/or with patient survival.

I will do an initial exploratory analysis including:

  • Search for missing values
  • Identify distribution of the different variables and check coherence
  • Check relationship/independency between different type of variables
  • Search for outliers

I will continue with different approaches for feature selection including:

  • Features filtering for radiomic variables (exclude near cerovariance, highly correlated variables and retain variable most related with target clinical variables)
  • Apply dimensionality reduction by using principal component analysis on radiomic data

Finally I will applying model based clustering to the different reduced datasets generated in the previous points and I will analyze results by:

  • Exploring within and across cluster variable distribution
  • Compare agreement between groups generated by different reduced datasets generated in previous point
  • Analyze agreement and relationship between cluster groups and the variables of interest

Initial exploratory analysis

Search for missing values

First I do an exploratory analysis to check for missing values in the whole data frame.

table(is.na(lung_data))
## 
##  FALSE   TRUE 
## 529131     66
table(is.null(lung_data))
## 
## FALSE 
##     1

We can see there are 66 missing values in total.

I generate a table that shows within the whole dataframe the variables with a list one missing value and the detail of number of missing values and its completeness rate.

(table1 <- skim(lung_data) %>% 
   select(skim_variable,n_missing,complete_rate) %>% 
   filter(n_missing > 0) %>% 
   kable(full_with = FALSE) %>% 
   kable_classic_2()) 
skim_variable n_missing complete_rate
clinical.T.Stage 1 0.9976247
Overall.Stage 1 0.9976247
Histology 42 0.9002375
age 22 0.9477435
table1 %>% save_kable("tables/table1.pdf")

I generate visual representations of single and joint missing values using gg_miss_upset function from naniar package.

tiff(filename = "figures/Fig1.tiff", width = 200, height = 150, units = "mm", res = 300)
gg_miss_upset(lung_data %>% select_if(~ any(is.na(.)))) 
fig <- dev.off()
gg_miss_upset(lung_data %>% select_if(~ any(is.na(.))))

Overall completeness rate was superior to 0.9 for every variable. Histology is our response variable for our first research question so observations without a value for histology variable will be fully excluded. In addition as a subgroup of NSCLC not otherwise specified (nos) is available within the histology categories it is not clear if NA cases have in effect the confirmation of NSCLC diagnosis at all; so it makes sense to exclude this cases.

lung_data_complete <- lung_data %>% drop_na(Histology)
dim(lung_data_complete)
## [1]  379 1257

After excluding every row with missing histology value, we end up with 379 observations. On the contrary we might imput missing values affecting other variables to avoid missing additional observations that may be usefull, but I will continue with imputation after evaluation data distribution, and data coherence to check if any additional values should be considered as missing and which imputation technique might be pertinent to use.

Identify distribution of the different variables and check coherence

I verify general statistical summary and distribution of features.

First for study dates:

lung_data_complete %>% select(Study.Date) %>% skim()
Data summary
Name Piped data
Number of rows 379
Number of columns 1
_______________________
Column type frequency:
Date 1
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Study.Date 0 1 2004-09-27 2014-01-01 2008-09-07 283

Then I get summary statistics for factor variables:

(table2a <- lung_data_complete %>% 
    select(Histology,gender,deadstatus.event,Manufacturer) %>% 
    univar_category() %>% 
    kable(full_with = FALSE) %>% kable_classic())
Histology n rate
adenocarcinoma 51 0.1345646
large cell 114 0.3007916
nos 62 0.1635884
squamous cell carcinoma 152 0.4010554
gender n rate
female 119 0.3139842
male 260 0.6860158
deadstatus.event n rate
0 43 0.1134565
1 336 0.8865435
Manufacturer n rate
CMS Inc.  86 0.2269129
Plastimatch 1 0.0026385
SIEMENS 292 0.7704485
table2a %>% save_kable("tables/table2a.pdf")
(table2b <- lung_data_complete %>% 
    select(Overall.Stage,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage) %>% 
    univar_category() %>% 
    kable(full_with = FALSE) %>% kable_classic())
Overall.Stage n rate
I 66 0.1741425
II 38 0.1002639
IIIa 108 0.2849604
IIIb 166 0.4379947
NA 1 0.0026385
clinical.T.Stage n rate
1 69 0.1820580
2 147 0.3878628
3 50 0.1319261
4 111 0.2928760
5 1 0.0026385
NA 1 0.0026385
Clinical.N.Stage n rate
0 140 0.3693931
1 21 0.0554090
2 134 0.3535620
3 81 0.2137203
4 3 0.0079156
Clinical.M.Stage n rate
0 375 0.9894459
3 4 0.0105541
table2b %>% save_kable("tables/table2b.pdf")

I represent factor variables with barplots to visualize data distribution and clases available for each categorical variable:

lung_data_complete %>%
  keep(is.factor) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_bar()+
    scale_y_continuous( limits = c(0,400)) +
    geom_text(stat='count', aes(label=..count..), vjust=-0.1, size = 3)

ggsave("figures/Fig2.tiff",width = 174, height = 130, device="tiff", dpi=300, units = "mm", scale = 1)

We can see from the Overall.Stage variable distribution, that stage IV is not represented in this dataset so we will center analysis mainly on this groups of non-metastatic disease patients. There seems to be some isolated incoherent, erroneous registries regarding clinical T, N and M categories. Possible values for clinical.T.Stage category in lung cancer range from T1 to T4, and as we can see there are few cases with T == 5. Same happens with Clinical.N.Stage == 4 and we also find some cases with Clinical M Stage == 3 that do not exists either within this classicifcation. Specific incoherent cases are enumerated in tables 5.3, 5.4, 5.5 respectively.

(table3 <- lung_data_complete %>% 
  filter(clinical.T.Stage ==5) %>% select(clinical.T.Stage,Overall.Stage)  %>% 
  kable(full_with = FALSE) %>% 
  kable_classic_2())
clinical.T.Stage Overall.Stage
LUNG1-272 5 NA
table3 %>% save_kable("tables/table3.pdf")
(table4 <- lung_data_complete %>% 
  filter(Clinical.N.Stage ==4) %>% select(Clinical.N.Stage,Overall.Stage) %>% 
  kable(full_with = FALSE) %>% 
  kable_classic_2())
Clinical.N.Stage Overall.Stage
LUNG1-167 4 IIIb
LUNG1-232 4 IIIb
LUNG1-292 4 IIIb
table4 %>% save_kable("tables/table4.pdf")
(table5 <- lung_data_complete %>% filter(Clinical.M.Stage ==3) %>% 
  select(Clinical.M.Stage,Overall.Stage) %>% 
  kable(full_with = FALSE) %>% 
  kable_classic_2()) 
Clinical.M.Stage Overall.Stage
LUNG1-072 3 IIIb
LUNG1-256 3 IIIb
LUNG1-269 3 IIIa
LUNG1-333 3 IIIa
table5 %>% save_kable("tables/table5.pdf")

These are interpreted as erroneous so missing values are assigned to replace these entries as well as for plastimatch manufacturer. This will be then handled along with missing values imputation.

# I assign na values to erroneous entries and drop unused levels:
lung_data_complete <- lung_data_complete %>% replace_with_na(
  replace = list(clinical.T.Stage = 5,
                 Clinical.N.Stage = 4,
                 Clinical.M.Stage = 3,
                 Manufacturer = 'Plastimatch')) %>%
  droplevels.data.frame()

We can see now the joint missing patterns after removing observations with histology missing values and imputing with missing values the incoherent clinical T/N/M entries:

tiff(filename = "figures/Fig3.tiff", width = 200, height = 150, units = "mm", res = 300)
gg_miss_upset(lung_data_complete %>% select_if(~ any(is.na(.)))) 
fig <- dev.off()
gg_miss_upset(lung_data_complete %>% select_if(~ any(is.na(.))))

Now I continue with the summary statistics for numeric clinical numeric variables:

(table6 <- lung_data_complete %>% 
  select(age,Survival.time) %>% 
  describe() %>%
  select(described_variables,n,na,mean,sd,se_mean,IQR,skewness,kurtosis) %>% 
  kable(full_with = FALSE) %>% kable_classic_2()) 
described_variables n na mean sd se_mean IQR skewness kurtosis
age 366 13 68.1016 10.14966 0.5305312 14.74875 -0.3099521 -0.3094659
Survival.time 379 0 983.6464 1020.99688 52.4450869 1126.00000 1.4764935 1.3393722
table6 %>% save_kable("tables/table6.pdf")

And generate histograms to better visualize distributions:

  • First for the remaining clinical variables:
g1 <- ggplot(lung_data_complete,aes(age)) + 
  geom_histogram(alpha = .8) +
  theme_bw() 
g2 <- ggplot(lung_data_complete,aes(Survival.time)) + 
  geom_histogram(alpha = .8) +
  theme_bw()
(g1 + g2)

ggsave("figures/Fig4.tiff",width = 174, height = 100, device="tiff", dpi=300, units = "mm", scale = 1)
  • As we have 1246 radiomic variables, I’ll continue exploring clinical and main study info first, and continue with radiomics exploratory analysis while and after performing variable selection

Check relationship/independency between different type of variables

First as my first research question is weather radiomic variables are related with histology class or not, I want to check histology is independent from other clinical variables and manufacturer of the scanner used to do the exam.

g1 <- ggplot(lung_data_complete,aes(age, fill = Histology)) + 
  geom_boxplot(alpha = .5) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = 'none')

g2 <- ggplot(lung_data_complete,aes(gender, fill = Histology)) + 
  geom_bar(position = 'dodge',alpha = .5) +
  theme_bw() +
  theme(legend.position = 'none')

g3 <- ggplot(lung_data_complete,aes(Overall.Stage, fill = Histology)) + 
  geom_bar(position = 'dodge',alpha = .5) +
  theme_bw() 

g4 <- ggplot(lung_data_complete,aes(Manufacturer, fill = Histology)) + 
  geom_bar(position = 'dodge',alpha = .5) +
  theme_bw() 

(g1 + g2)/(g3 + g4) + plot_layout(guides = "collect") 

ggsave("figures/Fig5.tiff",width = 174, height = 100, device="tiff", dpi=300, units = "mm", scale = 1)

I check the relationship between categorical variables with main variables of interest. I use crosstable package to ease table construction and automatic test to search for independency.

(table7 <- crosstable(lung_data_complete, c(age, gender, Overall.Stage, Manufacturer), 
           by = Histology, test = TRUE) %>% as_flextable() %>% autofit()) 
 table7 %>% save_as_image("tables/table7.png")
## [1] "/Users/mechyserra/Drive_SerraMM/Master_BioinformaticayBioestadistica/Trabajo_Fin_de_Master/TFM/TFM_MSerra/TFM_MSERRA_2022/tables/table7.png"
(table8 <- crosstable(lung_data_complete, c(age, gender, Manufacturer), 
           by = Overall.Stage, test = TRUE) %>% as_flextable() %>% autofit())
table8 %>% save_as_image("tables/table8.png")
## [1] "/Users/mechyserra/Drive_SerraMM/Master_BioinformaticayBioestadistica/Trabajo_Fin_de_Master/TFM/TFM_MSerra/TFM_MSERRA_2022/tables/table8.png"

In this tables we can easily see how gender and histology, and manufacturer and histology are independent. On the contrary, there seems to be some relationship between histology group and age, as well as between histology and overall stage. In this case it seems specially important to verify the relationship of these variables with any clustering model generated as these could bias interpretation of clustering association to target histology variable.

I continue verifying the relationship between ordinal variables using Spearman correlation coeficient

(table9 <- lung_data_complete %>% 
  select(Overall.Stage,clinical.T.Stage) %>% 
  drop_na(Overall.Stage,clinical.T.Stage) %>% 
  table()%>%
  addmargins() %>% 
  kable(full_with = FALSE) %>% kable_classic_2() %>%  
  add_header_above(c("Overall.Stage" = 1, "Clinical.T.Stage" = 5)) )
Overall.Stage
Clinical.T.Stage
1 2 3 4 Sum
I 27 39 0 0 66
II 3 18 17 0 38
IIIa 26 55 27 0 108
IIIb 13 35 6 111 165
Sum 69 147 50 111 377
table9 %>% save_kable("tables/table9.pdf")
cor(as.integer(lung_data_complete$clinical.T.Stage),
    as.integer(lung_data_complete$Overall.Stage),
    method = 'spearman',
    use = 'complete.obs')
## [1] 0.5841231
(table10 <- lung_data_complete %>% 
  select(Overall.Stage,Clinical.N.Stage) %>% 
  drop_na(Overall.Stage,Clinical.N.Stage) %>% 
  table()%>%
  addmargins() %>% 
  kable(full_width = FALSE) %>% 
  kable_classic_2() %>%  
  add_header_above(c("Overall.Stage" = 1, "Clinical.N.Stage" = 5))) 
Overall.Stage
Clinical.N.Stage
0 1 2 3 Sum
I 65 1 0 0 66
II 25 13 0 0 38
IIIa 0 4 96 8 108
IIIb 50 3 37 73 163
Sum 140 21 133 81 375
table10 %>% save_kable("tables/table10.pdf")
cor(as.integer(lung_data_complete$Clinical.N.Stage),
        as.integer(lung_data_complete$Overall.Stage),
        method = 'spearman',
        use = 'complete.obs')
## [1] 0.5245729
(table11 <- lung_data_complete %>% 
  select(clinical.T.Stage,Clinical.N.Stage) %>% 
  drop_na(clinical.T.Stage,Clinical.N.Stage) %>% 
  table()%>%
  addmargins() %>% 
  kable(full_width = FALSE) %>% 
  kable_classic_2() %>%  
  add_header_above(c("Clinical.T.Stage" = 1, "Clinical.N.Stage" = 5))) 
Clinical.T.Stage
Clinical.N.Stage
0 1 2 3 Sum
1 28 3 21 17 69
2 46 11 55 35 147
3 17 4 20 9 50
4 49 3 37 19 108
Sum 140 21 133 80 374
table11 %>% save_kable("tables/table11.pdf")
cor(as.integer(lung_data_complete$Clinical.N.Stage),
    as.integer(lung_data_complete$clinical.T.Stage),
    method = 'spearman',
    use = 'complete.obs')
## [1] -0.06926156

As expected we see there is some degree of correlation between Overall Stage and T and N clinical categories. On the contrary T and N categories do not show a high degree of correlation.

I check the distribution of different histology classes along study dates to check if there might be any suggestion of potential batch bias.

lung_data_complete$Study.Date <- as.POSIXct(lung_data_complete$Study.Date)
ggplot(lung_data_complete, aes(Study.Date, ..count.., fill = Histology)) + 
    geom_histogram(alpha = .5) +
    theme_bw() + xlab(NULL) +
    scale_x_datetime(breaks = date_breaks("3 months"),
                     labels = date_format("%Y-%b")) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggsave("figures/Fig6.tiff",width = 174, height = 100, device="tiff", dpi=300, units = "mm", scale = 1)

Regarding the period of time during which studies were performed, we see all classes seem pretty much represented along the years in favor of avoiding specific technique batch biases.

Now I want to check if there is a relationship between survival and main categorical classes. To do so I will use Survival and survminer R packages to construct Kaplan Meyer plots using Survival.time and deadstatus.event variables and compare mean survival between different classes.

tiff(filename = "figures/Fig7.tiff", width = 300, height = 200, units = "mm", res = 300)
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Histology, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           tables.height = 0.2)
fig <- dev.off()
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Histology, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           tables.height = 0.2)

tiff(filename = "figures/Fig8.tiff", width = 300, height = 200, units = "mm", res = 300)
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Overall.Stage, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           tables.height = 0.2)
fig <- dev.off()
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Overall.Stage, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           tables.height = 0.2)

tiff(filename = "figures/Fig9.tiff", width = 300, height = 200, units = "mm", res = 300)
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ gender, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
            tables.height = 0.2)
fig <- dev.off()
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ gender, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
            tables.height = 0.2)

tiff(filename = "figures/Fig10.tiff", width = 300, height = 200, units = "mm", res = 300)
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Manufacturer, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
            tables.height = 0.2)
fig <- dev.off()
ggsurvplot(survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ Manufacturer, data = lung_data_complete), 
           conf.int=TRUE, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
            tables.height = 0.2)

While there is no significant relationship between Survival and Histology, Survival and Overall Stage, and Survival and Gender; there is a finding of major importance here regarding observations corresponding to scanners from different Manufacturer. This could add a major bias to the model so we have to exclude any radiomic features related to scanner Manufacturer to make sure to remove any bias from this source.

Relationship between radiomic numeric variables

As there are so many continuous radiomic variables I will just check for the moment their pearson correlation coefficient and represent this with heatmap. After performing variable selection I will further analyze radiomic variables distribution and relationship with other clinical variables.

radiomic_vars <- lung_data_complete %>% keep(is.numeric) %>%
  select(.,-c(age,Survival.time))
plot_correlation(radiomic_vars, type = "c", theme_config = list("legend.position" = "bottom","axis.text.x" = element_blank(), "axis.text.y" = element_blank()))

ggsave("figures/Fig11.tiff",width = 200, height = 200, device="tiff", dpi=300, units = "mm", scale = 1)

Now I will perform a Shapiro Wilk test to formaly evaluate normality for every numeric variable available in current data frame:

# I apply shapiro wilk test to every numeric value in the data frame
shap <- lung_data_complete %>% 
  keep(is.numeric) %>% 
  apply(.,2,shapiro.test) 

# I define a vector with a boolean stating if p value is greater than corrected 0.05
shapres <- sapply(shap, function(x) x$p.value > .05/length(shap))
table(shapres)
## shapres
## FALSE  TRUE 
##  1081   167

When exploring numerical variable distribution we could already see many variables with an assymetric distribution mostly non-normally distributed variables. We will take this into account when selecting outlier detection techniques and imputation techniques both for outliers and missing values.

Search for outliers

Univariate outlier detection:

Univariate oulier detection with IQR + 1.5 criteria. If we just check in univariate manner, we can see almost every observation has at least one value corresponding to an outlier in at least one variable.

check_outliers(lung_data_complete, method = "iqr")
## Warning: 347 outliers detected (cases 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 187, 188, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 235, 236, 237, 238, 239, 241, 242, 243, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 268, 269, 270, 272, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 301, 304, 305, 306, 307, 308, 309, 311, 312, 313, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379).

Multivariate outlier detection:

To perform outlier detection I will use HDoutliers package that allows performing multivariate outlier detection for mixed data. To be able to work with this algorithm I need to perform imputation of missing values first so I will use missForest package that gives me the possibility to perform missing value imputation, again taking in account the mixed data types we are dealing with.

Pre-processing step: missing values imputation

I perform missing values imputation first:

lung_data.imp <- missForest(lung_data_complete %>% select(.,-Study.Date))

#check imputation error. NRMSE is normalized mean squared error. It is used to represent error derived 
# from imputing continuous values. PFC (proportion of falsely classified) is used to represent error 
# derived from imputing categorical values.
lung_data.imp$OOBerror
##        NRMSE          PFC 
## 4.420432e-12 2.114762e-01
#check new values
lung_data.imp$ximp[rownames(lung_data.imp$ximp) %in% rownames(which(is.na(lung_data_complete), arr.ind=TRUE)),] %>% select(age,gender,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,deadstatus.event,Manufacturer)
##                age gender clinical.T.Stage Clinical.N.Stage Clinical.M.Stage
## LUNG1-058 82.27790   male                2                0                0
## LUNG1-072 71.47430   male                4                3                0
## LUNG1-085 53.65910 female                2                3                0
## LUNG1-149 66.43793   male                3                3                0
## LUNG1-167 74.92950   male                4                0                0
## LUNG1-174 67.30680   male                1                0                0
## LUNG1-232 73.28680   male                4                2                0
## LUNG1-256 53.08420   male                4                2                0
## LUNG1-269 73.05950   male                3                3                0
## LUNG1-270 66.26055   male                1                2                0
## LUNG1-272 60.13960   male                2                2                0
## LUNG1-275 69.21163   male                2                3                0
## LUNG1-292 66.21490   male                4                0                0
## LUNG1-299 67.76281   male                1                0                0
## LUNG1-303 71.70415   male                2                0                0
## LUNG1-307 68.70723 female                1                0                0
## LUNG1-308 64.77866 female                2                1                0
## LUNG1-333 63.69880   male                1                2                0
## LUNG1-339 67.13958   male                4                2                0
## LUNG1-341 69.85387   male                2                0                0
## LUNG1-349 64.12066   male                2                1                0
## LUNG1-354 67.20546 female                1                2                0
## LUNG1-385 63.75625   male                2                0                0
##           Overall.Stage deadstatus.event Manufacturer
## LUNG1-058             I                1      SIEMENS
## LUNG1-072          IIIb                1      SIEMENS
## LUNG1-085          IIIb                1     CMS Inc.
## LUNG1-149          IIIb                1     CMS Inc.
## LUNG1-167          IIIb                1      SIEMENS
## LUNG1-174             I                1      SIEMENS
## LUNG1-232          IIIb                1      SIEMENS
## LUNG1-256          IIIb                1      SIEMENS
## LUNG1-269          IIIa                1      SIEMENS
## LUNG1-270          IIIa                1      SIEMENS
## LUNG1-272          IIIb                1      SIEMENS
## LUNG1-275          IIIb                1      SIEMENS
## LUNG1-292          IIIb                1      SIEMENS
## LUNG1-299          IIIb                1      SIEMENS
## LUNG1-303             I                1      SIEMENS
## LUNG1-307             I                1      SIEMENS
## LUNG1-308            II                1      SIEMENS
## LUNG1-333          IIIa                1      SIEMENS
## LUNG1-339          IIIb                1      SIEMENS
## LUNG1-341             I                1      SIEMENS
## LUNG1-349            II                1      SIEMENS
## LUNG1-354          IIIa                1      SIEMENS
## LUNG1-385            II                1      SIEMENS
# we can see now M stage is in fact 0 for every entry, so we will remove this column
table(lung_data.imp$ximp$Clinical.M.Stage)
## 
##   0 
## 379
# I remove Clinical M Stage from the dataset and I add Study Date again
lung_data_naimp <- lung_data.imp$ximp %>% select(.,-Clinical.M.Stage) 
Multivariate outlier detection:
# Transforms the data according to the specifications in Wilkinson’s hdoutliers algorithm replaces each categorical variables with a numeric variable corresponding to its first component in multiple correspondence analysis, then maps the data to the unit square

lung_data.trans <- dataTrans(lung_data_naimp)

# Implements the first stage of the hdoutliers Algorithm, in which the data is partitioned according to exemplars and their associated lists of members.

lung_data.mem <- getHDmembers(lung_data.trans)

# An exponential distribution is fitted to the upper tail of the nearest-neighbor distances between
# exemplars (the observations considered representatives of each component of member

lung_data.out <- getHDoutliers(lung_data.trans, lung_data.mem, alpha = 0.05)

# Produces a plot of the data (transformed according to the Wilkinson’s specifications) showing the
# outliers. If the data has more than two dimensions, it is plotted onto the principal components of
# the data that remains after removing outliers.

tiff(filename = "figures/Fig12.tiff", width = 200, height = 150, units = "mm", res = 300)
plotHDoutliers(lung_data_naimp,lung_data.out)
fig <- dev.off()
plotHDoutliers(lung_data_naimp,lung_data.out)

# we identify the specific observations classified as outliers:
rownames(lung_data.trans)[c(lung_data.out)]
## [1] "LUNG1-027" "LUNG1-069"

When taking in account all the categorical and numerical variables available on the data set, excluding dates, 2 particular observations are considered outliers, “LUNG1-027” and “LUNG1-069”. Extreme values for a single variable seem to make more sense when they are explained by other variables.

I remove the outliers detected:

# I add study date variable again to the update data set
lung_data_no_na <- lung_data_naimp %>% cbind(Study.Date = lung_data_complete$Study.Date)
lung_data_no_out <- lung_data_no_na[-c(which(rownames(lung_data_no_na) %in% rownames(lung_data.trans)[c(lung_data.out)])),]
dim(lung_data_no_out)
## [1]  377 1256

Feature selection for radiomic variables

# I build a dataframe just with the radiomic features:
radiomic_vars <- lung_data_no_out %>% keep(is.numeric) %>%
  select(.,-c(age,Survival.time))

Removing near cero variance features

To check for near zero variance variables I use nearZeroVar fuction from caret package.

names(radiomic_vars)[nearZeroVar(radiomic_vars)]
## character(0)

No near 0 variance variables

First approach: MRMR

I first use maximum relevance minimum redundance algorithm. As there are so many radiomic features, I want to be sure I preserve most relevant features for the classes of interest while filtering redundant variables. To do so I use MRMR function from praznik package, and mRMR.classic function from mRMRe package.

# I first select MRMR features taking in account histology class:
select1 <- MRMR(radiomic_vars,lung_data_no_out$Histology,k = 20)
names(select1$selection)
##  [1] "original_shape_VoxelVolume"                              
##  [2] "wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis"        
##  [3] "wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis"         
##  [4] "log.sigma.1.0.mm.3D_glszm_LargeAreaHighGrayLevelEmphasis"
##  [5] "log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis" 
##  [6] "wavelet.HHL_glcm_ClusterProminence"                      
##  [7] "wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis"         
##  [8] "wavelet.HHH_firstorder_Kurtosis"                         
##  [9] "log.sigma.5.0.mm.3D_glcm_ClusterProminence"              
## [10] "wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis"         
## [11] "log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity"         
## [12] "wavelet.LLH_glcm_ClusterProminence"                      
## [13] "wavelet.HLL_glcm_Autocorrelation"                        
## [14] "log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis" 
## [15] "wavelet.HHH_glszm_GrayLevelNonUniformity"                
## [16] "original_firstorder_Kurtosis"                            
## [17] "wavelet.HHH_firstorder_Median"                           
## [18] "log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis" 
## [19] "wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis"   
## [20] "wavelet.HHL_firstorder_Kurtosis"
# Then I do the same for the survival target interest
df <- as.data.frame(radiomic_vars %>% cbind(surv1 = Surv(lung_data_no_out$Survival.time, lung_data_no_out$deadstatus.event)))
df$original_shape_VoxelVolume <- as.numeric(df$original_shape_VoxelVolume)
dd <- mRMR.data(data = df)
select2 <- mRMR.classic(data = dd, target_indices = c(grep("surv1", colnames(df))), feature_count = 20)
selected2_features <- select2@feature_names[solutions(select2)$'1247'[,1]]
selected2_features
##  [1] "wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis"
##  [2] "log.sigma.3.0.mm.3D_glszm_ZoneVariance"              
##  [3] "wavelet.HHH_glrlm_GrayLevelVariance"                 
##  [4] "original_gldm_LargeDependenceLowGrayLevelEmphasis"   
##  [5] "wavelet.HLH_glcm_ClusterShade"                       
##  [6] "wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis"    
##  [7] "wavelet.HHH_glcm_Imc2"                               
##  [8] "original_shape_Elongation"                           
##  [9] "wavelet.LHH_glszm_SizeZoneNonUniformity"             
## [10] "log.sigma.2.0.mm.3D_glcm_ClusterShade"               
## [11] "wavelet.HHH_firstorder_Skewness"                     
## [12] "log.sigma.2.0.mm.3D_firstorder_Kurtosis"             
## [13] "wavelet.LLH_glszm_SmallAreaEmphasis"                 
## [14] "wavelet.LLH_glcm_MCC"                                
## [15] "wavelet.HHL_firstorder_Median"                       
## [16] "log.sigma.3.0.mm.3D_glcm_Autocorrelation"            
## [17] "wavelet.HLH_glszm_SmallAreaEmphasis"                 
## [18] "wavelet.HHL_gldm_DependenceNonUniformityNormalized"  
## [19] "wavelet.HHH_glszm_LowGrayLevelZoneEmphasis"          
## [20] "wavelet.HHH_firstorder_Median"
# I check variables matching for both vectors of selected variables
intersect(names(select1$selection),selected2_features)
## [1] "wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis"
## [2] "wavelet.HHH_firstorder_Median"
# Still keep all the variables selected for at least one class taking in account a variable might be highly related to histology class for example and not to survival and so on
mrmr_radiomics <- radiomic_vars %>% dplyr::select(names(select1$selection),all_of(selected2_features))

plot_correlation(mrmr_radiomics, type = "c", theme_config = list(legend.position = "bottom", axis.text.x = element_text(angle =
    90, size = 9), axis.text.y = element_text(size = 9)))

ggsave("figures/Fig13.tiff",width = 260, height = 260, device="tiff", dpi=300, units = "mm", scale = 1)

dim(mrmr_radiomics)
## [1] 377  38

We can see dim(mrmr_radiomics)[2] radiomic features were selected taking in account some minimal overlap between both MRMR selections regarding histology or survival.

Then I eliminate still highly redundant variables using findCorrelation function from caret package.

# As after MRMR I still have blocks of highly correlated variables I add an additional simple correlation filter to eliminate those still highly correlated:
redundant_vars <- findCorrelation(
  cor(mrmr_radiomics),
  cutoff = 0.90,
  names = TRUE
)
filtered_radiomics <- mrmr_radiomics %>% dplyr::select(.,-all_of(redundant_vars))
plot_correlation(filtered_radiomics, type = "c")

ggsave("figures/Fig14.tiff",width = 250, height = 250, device="tiff", dpi=300, units = "mm", scale = 1)

dim(filtered_radiomics)
## [1] 377  35

We are now left with dim(filtered_radiomics)[2] radiomic features.

Now that we have a few selected features, I can deepen radiomic feature description missing on initial global analysis.

Selected radiomic features univariate distribution

(table12 <- filtered_radiomics %>% skim() %>% 
  select(.,-c(skim_type,n_missing,complete_rate, numeric.hist)) %>% 
  kable(full_with = FALSE) %>% kable_classic_2()) 
skim_variable numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100
original_shape_VoxelVolume 7.514392e+04 9.164034e+04 525.0000000 1.122300e+04 4.162500e+04 1.092300e+05 4.986630e+05
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis 2.689784e+08 1.414345e+09 5127.4493243 2.589023e+06 2.406861e+07 1.661301e+08 2.517207e+10
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis 1.921103e+06 1.321737e+07 364.0807895 5.310705e+04 2.203880e+05 7.836464e+05 2.461455e+08
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 3.088475e+03 1.541558e+04 0.0885386 2.077847e+01 1.257651e+02 8.214749e+02 2.295998e+05
wavelet.HHL_glcm_ClusterProminence 7.665988e+01 1.454862e+02 1.2028982 1.363245e+01 3.749540e+01 8.591309e+01 2.101730e+03
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis 2.827624e+04 1.141490e+05 11.5476591 1.099798e+03 4.079064e+03 1.594923e+04 1.695279e+06
wavelet.HHH_firstorder_Kurtosis 7.149780e+00 6.195689e+00 3.2579746 4.933786e+00 5.814347e+00 7.550008e+00 8.856737e+01
log.sigma.5.0.mm.3D_glcm_ClusterProminence 4.105798e+04 7.980577e+04 286.5581147 9.186272e+03 1.933337e+04 4.394937e+04 1.045926e+06
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis 2.715440e+08 9.921636e+08 22674.7361111 4.524356e+06 2.667024e+07 1.215297e+08 1.383512e+10
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity 1.278862e+02 1.749412e+02 1.5555556 4.291632e+01 9.022238e+01 1.601304e+02 2.602432e+03
wavelet.LLH_glcm_ClusterProminence 5.646281e+02 9.301258e+02 5.2084513 1.451064e+02 3.193128e+02 6.401288e+02 1.385699e+04
wavelet.HLL_glcm_Autocorrelation 8.338607e+02 1.196458e+03 92.2158770 3.548129e+02 5.427982e+02 8.684058e+02 1.543454e+04
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis 2.035400e-03 2.590600e-03 0.0005336 1.087800e-03 1.426400e-03 1.947400e-03 3.004540e-02
wavelet.HHH_glszm_GrayLevelNonUniformity 6.279034e+00 1.078154e+01 1.0000000 1.666667e+00 3.200000e+00 6.111111e+00 1.091316e+02
original_firstorder_Kurtosis 1.292314e+01 2.131993e+01 1.5401904 3.500586e+00 7.569782e+00 1.517709e+01 3.170560e+02
wavelet.HHH_firstorder_Median -7.881000e-04 3.247980e-02 -0.2030416 -5.716700e-03 2.308000e-04 6.394800e-03 1.625301e-01
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 3.137896e+03 1.247043e+04 0.0501319 3.921805e+01 2.740681e+02 1.506276e+03 1.342141e+05
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis 4.974261e+00 7.463963e+00 0.1724759 1.305744e+00 2.843033e+00 6.837657e+00 1.212130e+02
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis 1.426500e-03 1.429000e-03 0.0000764 5.768000e-04 8.849000e-04 1.698600e-03 8.357500e-03
wavelet.HHH_glrlm_GrayLevelVariance 2.508655e-01 2.168600e-03 0.2494989 2.499973e-01 2.499999e-01 2.507612e-01 2.697846e-01
original_gldm_LargeDependenceLowGrayLevelEmphasis 5.334720e-02 6.543750e-02 0.0096717 2.564420e-02 3.733500e-02 5.494150e-02 6.689996e-01
wavelet.HLH_glcm_ClusterShade 1.763770e-02 1.020537e-01 -1.0279270 -7.478000e-03 6.204800e-03 3.116050e-02 7.888853e-01
wavelet.HHH_glcm_Imc2 1.162806e-01 3.131030e-02 0.0498679 9.233770e-02 1.149520e-01 1.360351e-01 2.191420e-01
original_shape_Elongation 7.257929e-01 1.616055e-01 0.0621314 6.391788e-01 7.449973e-01 8.460198e-01 9.858400e-01
wavelet.LHH_glszm_SizeZoneNonUniformity 9.235505e+01 1.188264e+02 1.0000000 2.317582e+01 5.756831e+01 1.162765e+02 1.148264e+03
log.sigma.2.0.mm.3D_glcm_ClusterShade 2.194476e+02 6.508272e+02 -5500.5484241 -8.059794e+01 5.258937e+01 4.121356e+02 4.597537e+03
wavelet.HHH_firstorder_Skewness -1.931860e-02 7.218790e-02 -0.2874264 -4.908130e-02 -1.137630e-02 1.318320e-02 4.302120e-01
log.sigma.2.0.mm.3D_firstorder_Kurtosis 7.041946e+00 8.070028e+00 1.8922936 3.449192e+00 5.121107e+00 7.916690e+00 1.183450e+02
wavelet.LLH_glszm_SmallAreaEmphasis 4.379758e-01 3.893630e-02 0.2876537 4.139486e-01 4.375262e-01 4.633012e-01 5.831727e-01
wavelet.LLH_glcm_MCC 6.643959e-01 3.587040e-02 0.5432266 6.417585e-01 6.627832e-01 6.897787e-01 7.763111e-01
wavelet.HHL_firstorder_Median -1.944310e-02 3.549587e-01 -1.4443134 -1.289531e-01 -2.335890e-02 5.964460e-02 1.935696e+00
log.sigma.3.0.mm.3D_glcm_Autocorrelation 3.131763e+02 2.927697e+02 77.0505618 1.895635e+02 2.576249e+02 3.514185e+02 3.983002e+03
wavelet.HLH_glszm_SmallAreaEmphasis 4.790389e-01 6.466070e-02 0.2133874 4.446185e-01 4.769316e-01 5.145408e-01 7.057769e-01
wavelet.HHL_gldm_DependenceNonUniformityNormalized 7.134910e-02 1.080340e-02 0.0560091 6.388950e-02 6.776520e-02 7.579840e-02 1.213543e-01
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis 5.229704e-01 1.609709e-01 0.0641208 4.240196e-01 5.000000e-01 6.250000e-01 8.928571e-01
table12 %>% save_kable("tables/table12.pdf")

In this table we can see summary statistics for selected radiomic features, we can see many of these include negative values, somthing to take into account when evaluating data transformation.

I further represent univariate distribution by representing data with histograms:

filtered_radiomics %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free", ncol = 4) +
    geom_histogram() +
    theme(strip.text.x = element_text(size = 6))

ggsave("figures/Fig15.tiff",width = 300, height = 300, device="tiff", dpi=300, units = "mm", scale = 1)

And I further evaluate normality by generating qqplot for these selected features.

filtered_radiomics %>% 
  gather() %>% 
  ggplot(aes(sample=value)) +
    facet_wrap(~ key, scales = "free", ncol = 4) +
    stat_qq() +
    theme(strip.text.x = element_text(size = 6))

ggsave("figures/Fig16.tiff",width = 300, height = 300, device="tiff", dpi=300, units = "mm", scale = 1)

As previously evaluated for the whole dataset with Shapiro Wilk test, we see in both histograms and qqplots that many radiomic variables deviate from normality and have a rather skewed distribution. We should take this into account as data transformation may be needed for optimal performance of different model-based cluster models.

As we have mainly positive skewed distributions I will apply a log transform after adding a mimimum value to avoid negative and ease optimization of different models.

min(filtered_radiomics)
## [1] -5500.548
log_filtered_radiomics <- filtered_radiomics %>%
  lapply(function(x){log(x+5503)}) %>% 
  as.data.frame()

log_filtered_radiomics %>% 
  gather() %>% 
  ggplot(aes(sample=value)) +
    facet_wrap(~ key, scales = "free", ncol = 4) +
    stat_qq() +
    theme(strip.text.x = element_text(size = 6))

ggsave("figures/Fig17.tiff",width = 300, height = 300, device="tiff", dpi=300, units = "mm", scale = 1)

As we can see in the histogram plots still a lot of variables seem to present extreme outliers, I will evaluate this again now with selected variables in order to define if further observations should be excluded to optimize data modeling.

lung_data_filtered_radiomics <- cbind(lung_data_no_out %>%
                                        dplyr::select(.,-c(starts_with('original'),
                                                    starts_with('wavelet'),
                                                    starts_with('log'),
                                                    Study.Date)),
                                      log_filtered_radiomics)

out <- HDoutliers(lung_data_filtered_radiomics)
plotHDoutliers(lung_data_filtered_radiomics,out)

No additional multivariate outliers are detected when relating all the variables.

Selected radiomic features relationship/independency with main variables clinical and dicom variables

We continue with evaluation of selected radiomic feature relationship to pertinent clinical and dicom variables. First I generate boxplots to represent radiomic feature distribution for the different histology classes.

plot_boxplot(log_filtered_radiomics %>% 
               cbind(Histology = lung_data_no_out$Histology)
             , by = "Histology",
             ncol = 5,
             nrow = 7)+
    theme(strip.text.x = element_text(size = 1))

## NULL
ggsave("figures/Fig18.tiff",width = 500, height = 500, device="tiff", dpi=300, units = "mm", scale = 1)

I then evaluate mean and standard deviation of radiomic features within each class and test to verify if any of this relationships might be significant.

radiomics_hist <- log_filtered_radiomics %>% cbind(Histology = lung_data_no_out$Histology)
crostab <- crosstable(num_digits = 3,radiomics_hist, c(colnames(radiomics_hist %>% dplyr::select(.,-Histology))), by = Histology, test = TRUE) 

(table13 <- crostab %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable == 'Mean (std)') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
label variable adenocarcinoma large cell nos squamous cell carcinoma pval test
wavelet.HHH_firstorder_Kurtosis Mean (std) 8.615 (0.002) 8.614 (4e-04) 8.614 (3e-04) 8.614 (0.001) 0.0024 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 8.614 (0.003) 8.614 (0.001) 8.614 (0.001) 8.614 (0.001) 0.0032 (Kruskal-Wallis rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 8.613 (5e-06) 8.613 (6e-06) 8.613 (5e-06) 8.613 (6e-06) 0.0202 (One-way analysis of means)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 8.613 (4e-06) 8.613 (1e-05) 8.613 (2e-05) 8.613 (1e-05) 0.0264 (Kruskal-Wallis rank sum test)
wavelet.HHL_glcm_ClusterProminence Mean (std) 8.629 (0.046) 8.628 (0.017) 8.625 (0.016) 8.626 (0.020) 0.0348 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 8.614 (0.001) 8.614 (0.002) 8.614 (0.001) 8.614 (0.002) 0.0529 (Kruskal-Wallis rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 9.634 (1.058) 9.384 (0.926) 9.406 (0.960) 9.649 (1.061) 0.0667 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 8.613 (1e-05) 8.613 (1e-05) 8.613 (1e-05) 8.613 (1e-05) 0.1037 (Kruskal-Wallis rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 8.749 (0.219) 8.743 (0.112) 8.725 (0.081) 8.749 (0.128) 0.1518 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 10.094 (0.780) 10.412 (0.968) 10.263 (0.894) 10.168 (0.836) 0.1974 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 8.613 (3e-07) 8.613 (2e-07) 8.613 (3e-07) 8.613 (3e-07) 0.2246 (Kruskal-Wallis rank sum test)
wavelet.HHL_firstorder_Median Mean (std) 8.613 (7e-05) 8.613 (6e-05) 8.613 (7e-05) 8.613 (6e-05) 0.2449 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 8.613 (4e-07) 8.613 (6e-07) 8.613 (4e-07) 8.613 (4e-07) 0.2504 (Kruskal-Wallis rank sum test)
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 8.613 (5e-07) 8.613 (4e-07) 8.613 (2e-07) 8.613 (4e-07) 0.2819 (Kruskal-Wallis rank sum test)
wavelet.HLH_glcm_ClusterShade Mean (std) 8.613 (2e-05) 8.613 (2e-05) 8.613 (2e-05) 8.613 (1e-05) 0.2882 (Kruskal-Wallis rank sum test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 8.632 (0.065) 8.657 (0.087) 8.647 (0.097) 8.602 (0.639) 0.3796 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 8.633 (0.019) 8.638 (0.023) 8.634 (0.026) 8.635 (0.034) 0.4096 (Kruskal-Wallis rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 8.628 (0.018) 8.632 (0.026) 8.628 (0.020) 8.629 (0.017) 0.4396 (Kruskal-Wallis rank sum test)
wavelet.HHH_firstorder_Skewness Mean (std) 8.613 (1e-05) 8.613 (1e-05) 8.613 (1e-05) 8.613 (1e-05) 0.4447 (Kruskal-Wallis rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 8.661 (0.031) 8.668 (0.051) 8.670 (0.043) 8.668 (0.041) 0.4779 (Kruskal-Wallis rank sum test)
original_shape_Elongation Mean (std) 8.613 (3e-05) 8.613 (2e-05) 8.613 (3e-05) 8.613 (3e-05) 0.4918 (Kruskal-Wallis rank sum test)
wavelet.LLH_glcm_MCC Mean (std) 8.613 (7e-06) 8.613 (6e-06) 8.613 (6e-06) 8.613 (6e-06) 0.4957 (One-way analysis of means)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 8.613 (2e-06) 8.613 (2e-06) 8.613 (2e-06) 8.613 (2e-06) 0.5369 (Kruskal-Wallis rank sum test)
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 16.450 (3.198) 16.554 (2.861) 16.224 (3.279) 16.828 (2.872) 0.5620 (Kruskal-Wallis rank sum test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 8.709 (0.181) 8.700 (0.095) 8.716 (0.120) 8.699 (0.085) 0.6022 (Kruskal-Wallis rank sum test)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 8.614 (0.001) 8.614 (0.001) 8.615 (0.002) 8.614 (0.002) 0.6423 (Kruskal-Wallis rank sum test)
original_firstorder_Kurtosis Mean (std) 8.616 (0.003) 8.615 (0.002) 8.616 (0.004) 8.616 (0.005) 0.7704 (Kruskal-Wallis rank sum test)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 8.613 (8e-06) 8.613 (7e-06) 8.613 (7e-06) 8.613 (6e-06) 0.8683 (Kruskal-Wallis rank sum test)
original_shape_VoxelVolume Mean (std) 10.691 (1.206) 10.732 (1.106) 10.609 (1.217) 10.740 (1.089) 0.8789 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 8.613 (3e-05) 8.613 (3e-05) 8.613 (3e-05) 8.613 (3e-05) 0.8955 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 12.313 (2.077) 12.204 (1.935) 12.279 (2.157) 12.388 (1.828) 0.8993 (One-way analysis of means)
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 8.877 (0.529) 8.772 (0.365) 8.874 (0.610) 8.793 (0.465) 0.9263 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 16.933 (2.885) 16.883 (2.529) 16.917 (2.667) 17.047 (2.423) 0.9504 (Kruskal-Wallis rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 8.919 (0.524) 8.811 (0.387) 8.924 (0.618) 8.823 (0.433) 0.9802 (Kruskal-Wallis rank sum test)
wavelet.HHH_firstorder_Median Mean (std) 8.613 (8e-06) 8.613 (7e-06) 8.613 (7e-06) 8.613 (4e-06) 0.9838 (Kruskal-Wallis rank sum test)
table13 %>% save_kable('tables/table13.pdf')

Now I will evaluate potential relationship between individual radiomic features and survival. To do so I adjust a cox model for each one.

radiomics_surv <- log_filtered_radiomics %>% cbind(Survival.time = lung_data_no_out$Survival.time, deadstatus.event = lung_data_no_out$deadstatus.event) 

surv_formulas <- sapply(colnames(log_filtered_radiomics),function(x) as.formula(paste('Surv(Survival.time,as.numeric(deadstatus.event))~', x)))
surv_models <- lapply(surv_formulas, function(x){coxph(x, data = radiomics_surv)})

surv_rad_vars <- lapply(surv_models,
                       function(x){ 
                          x <- summary(x)
                          p.value<-format(signif(x$wald["pvalue"], digits=2),scientific = FALSE)
                          wald.test<-signif(x$wald["test"], digits=2)
                          beta<-signif(x$coef[1], digits=2);#coeficient beta
                          res<-c(beta, wald.test, p.value)
                          names(res)<-c("beta", "wald.test", 
                                        "p.value")
                          return(res)
                          #return(exp(cbind(coef(x),confint(x))))
                         })
res <- t(as.data.frame(surv_rad_vars, check.names = FALSE))
(table14 <- as.data.frame(res) %>% arrange(p.value) %>% 
  kable(full_with = FALSE) %>% kable_classic_2())
beta wald.test p.value
wavelet.LHH_glszm_SizeZoneNonUniformity 11 18 0.00002
wavelet.HHH_glszm_GrayLevelNonUniformity 100 18 0.000023
log.sigma.2.0.mm.3D_firstorder_Kurtosis 180 17 0.000031
original_firstorder_Kurtosis 63 13 0.00032
original_shape_VoxelVolume 0.18 13 0.00036
original_gldm_LargeDependenceLowGrayLevelEmphasis 14000 12 0.00062
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis 0.065 11 0.0011
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 0.34 10 0.0012
wavelet.HHH_firstorder_Kurtosis 120 9.6 0.0019
wavelet.HLL_glcm_Autocorrelation 1.1 9.6 0.0019
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 0.32 9.6 0.002
log.sigma.3.0.mm.3D_glcm_Autocorrelation 3.1 8.2 0.0041
wavelet.HLH_glcm_ClusterShade -8600 5.3 0.022
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis 0.061 4.6 0.031
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity 3 4 0.045
wavelet.HHH_glcm_Imc2 19000 3.9 0.048
wavelet.LLH_glszm_SmallAreaEmphasis 15000 3.8 0.052
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis -440000 3.2 0.073
wavelet.HHH_glrlm_GrayLevelVariance 210000 2.6 0.11
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis 0.078 2.1 0.15
wavelet.HHL_gldm_DependenceNonUniformityNormalized 36000 1.8 0.18
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis -2500 1.8 0.18
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis 0.028 1.6 0.2
wavelet.LLH_glcm_ClusterProminence -0.66 1.4 0.23
wavelet.HHH_firstorder_Skewness -5200 1.4 0.24
wavelet.HHL_firstorder_Median -930 1.3 0.25
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis 34 1.3 0.26
original_shape_Elongation -2000 1.3 0.26
log.sigma.5.0.mm.3D_glcm_ClusterProminence -0.07 1.2 0.27
wavelet.HHL_glcm_ClusterProminence 1.5 0.58 0.44
wavelet.HHH_firstorder_Median 3600 0.17 0.68
wavelet.HLH_glszm_SmallAreaEmphasis 940 0.04 0.84
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis 21000 0.03 0.87
wavelet.LLH_glcm_MCC 1200 0.02 0.88
log.sigma.2.0.mm.3D_glcm_ClusterShade 0.012 0.01 0.92
table14 %>% save_kable('tables/table14.pdf')

It is also important to evaluate the possible relationship between selected radiomic features and scanner manufacturer.

plot_boxplot(log_filtered_radiomics %>% 
               cbind(Manufacturer = lung_data_no_out$Manufacturer)
             , by = "Manufacturer",
             ncol = 5,
             nrow = 7)+
    theme(strip.text.x = element_text(size = 1))

## NULL
ggsave("figures/Fig19.tiff",width = 500, height = 500, device="tiff", dpi=300, units = "mm", scale = 1)

radiomics_manuf <- log_filtered_radiomics %>% cbind(Manufacturer = lung_data_no_out$Manufacturer)
crostab <- crosstable(num_digits = 3,radiomics_manuf, c(colnames(radiomics_manuf  %>% dplyr::select(.,-Manufacturer))), by = Manufacturer, test = TRUE) 

manuf_rad_vars <- crostab %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable == 'Mean (std)') %>% 
            dplyr::select(.,-c(string,.id))
(table15 <- manuf_rad_vars %>% kable(full_with = FALSE) %>% kable_classic_2()) 
label variable CMS Inc.  SIEMENS pval test
wavelet.HHL_glcm_ClusterProminence Mean (std) 8.630 (0.018) 8.626 (0.025) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 8.614 (0.001) 8.614 (0.001) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 8.613 (5e-07) 8.613 (4e-07) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 8.613 (5e-06) 8.613 (6e-06) <0.0001 (Wilcoxon rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 8.634 (0.019) 8.628 (0.021) 0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 8.613 (3e-05) 8.613 (3e-05) 0.0003 (Wilcoxon rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 8.776 (0.163) 8.734 (0.122) 0.0005 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 8.613 (2e-07) 8.613 (5e-07) 0.0016 (Wilcoxon rank sum test)
original_shape_VoxelVolume Mean (std) 10.963 (1.098) 10.637 (1.128) 0.0195 (Wilcoxon rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 8.615 (0.003) 8.614 (0.002) 0.0219 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 8.613 (2e-07) 8.613 (3e-07) 0.0369 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Skewness Mean (std) 8.613 (1e-05) 8.613 (1e-05) 0.0726 (Wilcoxon rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 9.288 (0.749) 9.596 (1.063) 0.0793 (Wilcoxon rank sum test)
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 17.178 (2.718) 16.429 (3.034) 0.1017 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Kurtosis Mean (std) 8.614 (0.001) 8.614 (0.001) 0.1580 (Wilcoxon rank sum test)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 8.614 (0.001) 8.614 (0.001) 0.2335 (Wilcoxon rank sum test)
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 8.806 (0.450) 8.813 (0.481) 0.2771 (Wilcoxon rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 8.855 (0.489) 8.847 (0.463) 0.2956 (Wilcoxon rank sum test)
wavelet.HLH_glcm_ClusterShade Mean (std) 8.613 (2e-05) 8.613 (2e-05) 0.3116 (Wilcoxon rank sum test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 8.715 (0.154) 8.700 (0.095) 0.3198 (Wilcoxon rank sum test)
wavelet.LLH_glcm_MCC Mean (std) 8.613 (6e-06) 8.613 (7e-06) 0.4597 (Welch Two Sample t-test)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 8.613 (7e-06) 8.613 (7e-06) 0.5130 (Wilcoxon rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 8.676 (0.066) 8.665 (0.034) 0.5701 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Median Mean (std) 8.613 (6e-06) 8.613 (6e-06) 0.5872 (Wilcoxon rank sum test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 8.613 (1e-05) 8.613 (1e-05) 0.5903 (Wilcoxon rank sum test)
original_shape_Elongation Mean (std) 8.613 (3e-05) 8.613 (3e-05) 0.6237 (Wilcoxon rank sum test)
original_firstorder_Kurtosis Mean (std) 8.615 (0.003) 8.615 (0.004) 0.6415 (Wilcoxon rank sum test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 8.547 (0.848) 8.653 (0.096) 0.6521 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 8.635 (0.020) 8.636 (0.030) 0.7935 (Wilcoxon rank sum test)
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 16.946 (2.378) 16.965 (2.603) 0.8345 (Wilcoxon rank sum test)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 8.613 (1e-05) 8.613 (1e-05) 0.8407 (Wilcoxon rank sum test)
wavelet.HHL_firstorder_Median Mean (std) 8.613 (4e-05) 8.613 (7e-05) 0.8916 (Wilcoxon rank sum test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 12.322 (1.679) 12.299 (2.016) 0.9169 (Welch Two Sample t-test)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 8.613 (2e-06) 8.613 (2e-06) 0.9439 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 10.199 (0.725) 10.261 (0.926) 0.9638 (Wilcoxon rank sum test)
table15 %>% save_kable('tables/table15.pdf')

Seems pertinent to exclude from the model those features that seem highly correlated to scanner manufacturer, as this could add additional bias to the model. I will not correct alfa as I prefer to be sure to exclude any possibly related feature.

out_manuf_vars <- manuf_rad_vars %>%
  separate(pval,into = c('pval'),sep = "<",convert = TRUE) %>% 
  filter(pval < 0.05) 

log_filtered_stable_radiomics <- log_filtered_radiomics %>% select(.,-c(out_manuf_vars$label))

I will still have to use Manufacturer variable in the model, given the previous proven lack of independence with mean survival.

We are now left with 28 radiomic features.

Second option PCA dimensionality reduction

As a second option I will try reducing dimensionality using PCA, an unsupervised approach.

I remove first highly correlated variables to ease further PCA analysis.

# I eliminate blocks of highly correlated variables by using a simple correlation filter to eliminate those still highly correlated:
redundant_vars <- findCorrelation(
  cor(radiomic_vars),
  cutoff = 0.95,
  names = TRUE
)
non_redund_radiomics <- radiomic_vars %>% select(.,-all_of(redundant_vars))
dim(non_redund_radiomics)
## [1] 377 458

Given the known skewed distribution of radiomic vars and negative values I will also transform data before performing PCA.

min(non_redund_radiomics)
## [1] -1963907
log_non_redund_radiomics <- non_redund_radiomics %>%
  lapply(function(x){log(x+1963908)}) %>% 
  as.data.frame()

To perform PCA I will use prcomp function from R stats package. I continue with PCA where I will select the 40 first components to work with. I will also scale data for PCA analysis to avoid the different weights coming from the difference in scale of each radiomics variable.

pca_radiomics <- prcomp(log_non_redund_radiomics, scale = TRUE)
pca_rad <- pca_radiomics$x[,1:40]

pca_radiomics$rotation[1:10,1:10]
##                                                PC1          PC2          PC3
## original_shape_Elongation              -0.01085453 -0.001876510 -0.010337314
## original_shape_Flatness                -0.01020703 -0.003981814 -0.025579199
## original_shape_LeastAxisLength         -0.08157157  0.011077081  0.004876149
## original_shape_MajorAxisLength         -0.05531321  0.009947248  0.019009331
## original_shape_Maximum2DDiameterColumn -0.07709468  0.013075388  0.019250751
## original_shape_Maximum2DDiameterRow    -0.07832697  0.015760610  0.015821557
## original_shape_Maximum2DDiameterSlice  -0.07668676  0.015624930  0.011927022
## original_shape_Maximum3DDiameter       -0.07297853  0.014118633  0.018206044
## original_shape_MinorAxisLength         -0.08054202  0.011756375  0.015704073
## original_shape_Sphericity               0.03786741 -0.028574811 -0.021900235
##                                                  PC4          PC5          PC6
## original_shape_Elongation               0.0192771873  0.007191065 -0.008802822
## original_shape_Flatness                 0.0138949805 -0.006764446 -0.012382370
## original_shape_LeastAxisLength          0.0209963693 -0.070482058 -0.003981804
## original_shape_MajorAxisLength          0.0026898472 -0.065055495  0.002465903
## original_shape_Maximum2DDiameterColumn  0.0156743670 -0.066658283 -0.002269238
## original_shape_Maximum2DDiameterRow     0.0073379759 -0.059492717 -0.007739259
## original_shape_Maximum2DDiameterSlice   0.0091178327 -0.054119097 -0.003913664
## original_shape_Maximum3DDiameter        0.0009487841 -0.064692795 -0.001120063
## original_shape_MinorAxisLength          0.0186342969 -0.065927916 -0.002216488
## original_shape_Sphericity              -0.0134134103 -0.007157391  0.013279513
##                                                 PC7          PC8          PC9
## original_shape_Elongation              -0.011558531 -0.031158237 -0.007457587
## original_shape_Flatness                 0.001093249 -0.004942720 -0.011557323
## original_shape_LeastAxisLength         -0.020672693  0.013436593  0.047571928
## original_shape_MajorAxisLength         -0.023584735  0.040663116  0.052984195
## original_shape_Maximum2DDiameterColumn -0.018289843  0.008133012  0.043383128
## original_shape_Maximum2DDiameterRow    -0.024042792  0.024137343  0.048924650
## original_shape_Maximum2DDiameterSlice  -0.038994344  0.013085841  0.053820951
## original_shape_Maximum3DDiameter       -0.024755016  0.029011059  0.051996212
## original_shape_MinorAxisLength         -0.030597585  0.006430111  0.050576637
## original_shape_Sphericity              -0.006038948 -0.025461216 -0.033898330
##                                                PC10
## original_shape_Elongation              -0.004491675
## original_shape_Flatness                 0.014272809
## original_shape_LeastAxisLength          0.025528096
## original_shape_MajorAxisLength          0.017188711
## original_shape_Maximum2DDiameterColumn  0.023891701
## original_shape_Maximum2DDiameterRow     0.010627343
## original_shape_Maximum2DDiameterSlice   0.018674765
## original_shape_Maximum3DDiameter        0.018754522
## original_shape_MinorAxisLength          0.015064998
## original_shape_Sphericity              -0.026471707

When evaluating the weights we can see most radiomic features applied to the original images contributed the most to first PCA components, much different to what we found as relevant when filtering variables with MRMR method

I evaluate the relationship of the different components to the variables of interest:

plot_boxplot(as.data.frame(pca_rad) %>% 
               cbind(Histology = lung_data_no_out$Histology)
             , by = "Histology",
             ncol = 5,
             nrow = 7)+
    theme(strip.text.x = element_text(size = 1))

## NULL
ggsave("figures/Fig20.tiff",width = 500, height = 500, device="tiff", dpi=300, units = "mm", scale = 1)


sum_pca <- summary(pca_radiomics)

pca_radiomics_hist <- as.data.frame(pca_rad) %>% cbind(Histology = lung_data_no_out$Histology)
crostab <- crosstable(num_digits = 3,pca_radiomics_hist, c(colnames(pca_radiomics_hist %>% select(.,-Histology))), by = Histology, test = TRUE) 

(table16 <- crostab %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable == 'Mean (std)') %>% 
            select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
label variable adenocarcinoma large cell nos squamous cell carcinoma pval test
PC4 Mean (std) -0.479 (6.230) 1.344 (4.878) 0.752 (3.811) -1.156 (4.798) 0.0002 (Kruskal-Wallis rank sum test)
PC17 Mean (std) 0.023 (2.706) 0.358 (1.902) -0.633 (1.772) -0.023 (1.844) 0.0018 (Kruskal-Wallis rank sum test)
PC23 Mean (std) -0.104 (1.511) -0.123 (1.617) -0.293 (1.597) 0.246 (1.643) 0.0148 (Kruskal-Wallis rank sum test)
PC14 Mean (std) -0.504 (1.690) -0.255 (2.092) 0.194 (2.779) 0.285 (2.151) 0.0348 (One-way analysis of means (not assuming equal variances))
PC9 Mean (std) -0.162 (4.366) -0.283 (3.068) -0.101 (2.500) 0.309 (2.781) 0.0617 (Kruskal-Wallis rank sum test)
PC32 Mean (std) -0.166 (1.512) -0.175 (1.268) 0.039 (1.356) 0.172 (1.260) 0.0656 (Kruskal-Wallis rank sum test)
PC7 Mean (std) -1.184 (2.997) 0.290 (3.398) 0.456 (3.838) -0.003 (3.731) 0.0714 (Kruskal-Wallis rank sum test)
PC13 Mean (std) -0.156 (1.921) -0.400 (2.129) 0.407 (2.471) 0.190 (2.394) 0.0716 (Kruskal-Wallis rank sum test)
PC38 Mean (std) 0.195 (1.212) -0.043 (1.269) 0.250 (1.041) -0.135 (1.088) 0.0722 (Kruskal-Wallis rank sum test)
PC3 Mean (std) 1.073 (7.567) 0.135 (4.675) 0.669 (4.949) -0.734 (5.790) 0.0918 (Kruskal-Wallis rank sum test)
PC8 Mean (std) -0.132 (3.472) 0.201 (3.225) 0.724 (2.738) -0.400 (3.409) 0.0992 (Kruskal-Wallis rank sum test)
PC2 Mean (std) -2.073 (10.065) 1.140 (9.420) -0.626 (8.224) 0.093 (9.400) 0.1011 (Kruskal-Wallis rank sum test)
PC21 Mean (std) 0.264 (2.036) -0.294 (1.577) -0.025 (1.904) 0.143 (1.600) 0.1221 (Kruskal-Wallis rank sum test)
PC24 Mean (std) 0.047 (1.560) 0.218 (1.504) -0.295 (1.728) -0.061 (1.558) 0.1499 (Kruskal-Wallis rank sum test)
PC16 Mean (std) 0.698 (2.773) -0.225 (1.764) 0.296 (1.682) -0.185 (2.064) 0.1748 (Kruskal-Wallis rank sum test)
PC11 Mean (std) 0.544 (2.607) -0.010 (2.404) -0.460 (2.275) 0.009 (2.570) 0.2173 (Kruskal-Wallis rank sum test)
PC15 Mean (std) -0.098 (2.383) -0.179 (1.810) -0.175 (1.599) 0.239 (2.288) 0.2351 (Kruskal-Wallis rank sum test)
PC34 Mean (std) 0.333 (1.409) -0.155 (1.331) -0.087 (1.082) 0.040 (1.193) 0.2656 (Kruskal-Wallis rank sum test)
PC33 Mean (std) 0.309 (1.814) -0.056 (1.260) -0.203 (1.186) 0.020 (1.152) 0.2709 (Kruskal-Wallis rank sum test)
PC39 Mean (std) -0.143 (0.997) -0.055 (0.994) 0.255 (1.300) -0.013 (1.201) 0.2904 (Kruskal-Wallis rank sum test)
PC10 Mean (std) 0.247 (3.166) -0.223 (2.598) -0.403 (3.080) 0.247 (2.781) 0.3254 (One-way analysis of means)
PC12 Mean (std) 0.236 (2.516) -0.060 (2.397) -0.427 (2.580) 0.138 (2.232) 0.3775 (Kruskal-Wallis rank sum test)
PC31 Mean (std) -0.143 (1.622) 0.032 (1.101) -0.077 (1.255) 0.055 (1.429) 0.3873 (Kruskal-Wallis rank sum test)
PC18 Mean (std) 0.103 (1.775) 0.151 (1.914) -0.299 (1.902) -0.028 (1.899) 0.3933 (Kruskal-Wallis rank sum test)
PC35 Mean (std) 0.096 (1.447) -0.111 (1.173) 0.007 (1.178) 0.049 (1.189) 0.4708 (Kruskal-Wallis rank sum test)
PC29 Mean (std) 0.091 (1.403) 0.022 (1.300) 0.038 (1.612) -0.063 (1.333) 0.5311 (Kruskal-Wallis rank sum test)
PC28 Mean (std) 0.023 (1.618) -0.141 (1.371) 0.176 (1.506) 0.028 (1.385) 0.5771 (Kruskal-Wallis rank sum test)
PC36 Mean (std) -0.045 (1.119) 0.112 (1.038) -0.072 (1.310) -0.040 (1.293) 0.6276 (Kruskal-Wallis rank sum test)
PC19 Mean (std) -0.030 (2.529) 0.016 (1.576) -0.016 (1.604) 0.004 (1.745) 0.6390 (Kruskal-Wallis rank sum test)
PC5 Mean (std) 0.375 (5.204) -0.194 (3.701) -0.632 (3.963) 0.275 (4.390) 0.6620 (Kruskal-Wallis rank sum test)
PC20 Mean (std) -0.060 (2.278) 0.053 (1.527) -0.169 (1.825) 0.048 (1.762) 0.7396 (Kruskal-Wallis rank sum test)
PC37 Mean (std) -0.125 (1.174) -0.014 (1.184) 0.015 (1.063) 0.047 (1.260) 0.7597 (Kruskal-Wallis rank sum test)
PC25 Mean (std) -0.102 (1.316) -0.089 (1.602) 0.019 (1.468) 0.094 (1.595) 0.7687 (Kruskal-Wallis rank sum test)
PC22 Mean (std) 0.023 (1.478) 0.102 (1.664) 0.133 (1.603) -0.138 (1.769) 0.7737 (Kruskal-Wallis rank sum test)
PC26 Mean (std) 0.185 (1.514) -0.020 (1.326) -0.083 (1.338) -0.014 (1.642) 0.7854 (Kruskal-Wallis rank sum test)
PC40 Mean (std) 0.170 (1.156) -0.027 (1.087) 0.042 (1.115) -0.054 (1.143) 0.8037 (Kruskal-Wallis rank sum test)
PC1 Mean (std) 0.108 (12.048) 0.203 (10.287) 0.754 (11.679) -0.494 (10.196) 0.8710 (Kruskal-Wallis rank sum test)
PC6 Mean (std) 0.116 (3.338) -0.254 (3.687) 0.356 (4.900) 0.009 (3.831) 0.8829 (Kruskal-Wallis rank sum test)
PC30 Mean (std) 0.150 (1.605) 0.064 (1.303) -0.002 (1.404) -0.098 (1.299) 0.8921 (Kruskal-Wallis rank sum test)
PC27 Mean (std) -0.064 (1.724) 0.042 (1.251) 0.029 (1.481) -0.022 (1.492) 0.9498 (Kruskal-Wallis rank sum test)
table16 %>% save_kable('tables/table16.pdf')
radiomics_pca_surv <- as.data.frame(pca_rad) %>% cbind(Survival.time = lung_data_no_out$Survival.time, deadstatus.event = lung_data_no_out$deadstatus.event) 


surv_formulas <- sapply(colnames(as.data.frame(pca_rad)),function(x) as.formula(paste('Surv(Survival.time,as.numeric(deadstatus.event))~', x)))
surv_models <- lapply(surv_formulas, function(x){coxph(x, data = radiomics_pca_surv)})

surv_rad_pca_vars <- lapply(surv_models,
                       function(x){ 
                          x <- summary(x)
                          p.value<-format(signif(x$wald["pvalue"], digits=2),scientific = FALSE)
                          wald.test<-signif(x$wald["test"], digits=2)
                          beta<-signif(x$coef[1], digits=2);#coeficient beta
                          res<-c(beta, wald.test, p.value)
                          names(res)<-c("beta", "wald.test", 
                                        "p.value")
                          return(res)
                          #return(exp(cbind(coef(x),confint(x))))
                         })
res_pca <- t(as.data.frame(surv_rad_pca_vars, check.names = FALSE))
(table17 <- as.data.frame(res_pca) %>% arrange(p.value) %>% kable(full_with = FALSE) %>% kable_classic_2() )
beta wald.test p.value
PC1 -0.018 10 0.0012
PC3 0.024 6.3 0.012
PC11 0.053 4.9 0.027
PC21 0.069 4.4 0.035
PC28 0.087 4.3 0.038
PC27 -0.085 4.2 0.04
PC32 0.082 3.4 0.067
PC23 0.058 2.7 0.098
PC25 0.058 2.7 0.098
PC30 -0.069 2.6 0.1
PC18 -0.045 2.4 0.12
PC15 0.042 2.3 0.13
PC13 -0.036 1.8 0.17
PC10 0.022 1.2 0.27
PC2 0.0067 1.1 0.29
PC4 -0.013 1.1 0.29
PC7 0.015 0.94 0.33
PC19 0.031 0.91 0.34
PC31 0.04 0.86 0.35
PC6 0.012 0.7 0.4
PC26 0.031 0.65 0.42
PC36 0.038 0.56 0.45
PC33 0.033 0.54 0.46
PC14 0.016 0.4 0.53
PC8 0.0099 0.28 0.6
PC9 0.0085 0.24 0.63
PC37 0.022 0.21 0.64
PC12 -0.01 0.2 0.65
PC20 -0.016 0.21 0.65
PC24 0.017 0.18 0.67
PC34 -0.02 0.19 0.67
PC40 -0.019 0.14 0.71
PC5 -0.0045 0.11 0.74
PC17 -0.0065 0.06 0.81
PC22 0.0083 0.06 0.81
PC29 -0.01 0.06 0.81
PC35 0.011 0.06 0.81
PC38 -0.012 0.06 0.81
PC39 -0.012 0.06 0.81
PC16 0.0055 0.04 0.85
table17 %>% save_kable('tables/table17.pdf')
plot_boxplot(as.data.frame(pca_rad) %>% 
               cbind(Manufacturer = lung_data_no_out$Manufacturer)
             , by = "Manufacturer",
             ncol = 5,
             nrow = 7)+
    theme(strip.text.x = element_text(size = 1))

## NULL
ggsave("figures/Fig21.tiff",width = 500, height = 500, device="tiff", dpi=300, units = "mm", scale = 1)

radiomics_manuf <- as.data.frame(pca_rad) %>% cbind(Manufacturer = lung_data_no_out$Manufacturer)
crostab <- crosstable(num_digits = 3,radiomics_manuf, c(colnames(radiomics_manuf  %>% dplyr::select(.,-Manufacturer))), by = Manufacturer, test = TRUE) 

manuf_rad_vars <- crostab %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable == 'Mean (std)') %>% 
            dplyr::select(.,-c(string,.id))
(table18 <- manuf_rad_vars %>% kable(full_with = FALSE) %>% kable_classic_2()) 
label variable CMS Inc.  SIEMENS pval test
PC11 Mean (std) 0.686 (2.660) -0.197 (2.402) <0.0001 (Wilcoxon rank sum test)
PC12 Mean (std) -1.011 (1.870) 0.290 (2.433) <0.0001 (Wilcoxon rank sum test)
PC2 Mean (std) 2.494 (8.718) -0.715 (9.403) 0.0014 (Wilcoxon rank sum test)
PC9 Mean (std) 0.828 (2.898) -0.237 (3.100) 0.0019 (Wilcoxon rank sum test)
PC4 Mean (std) 1.441 (5.228) -0.413 (4.866) 0.0023 (Wilcoxon rank sum test)
PC8 Mean (std) 0.777 (3.097) -0.223 (3.296) 0.0029 (Wilcoxon rank sum test)
PC30 Mean (std) -0.351 (1.456) 0.101 (1.316) 0.0041 (Wilcoxon rank sum test)
PC20 Mean (std) 0.366 (1.950) -0.105 (1.717) 0.0044 (Wilcoxon rank sum test)
PC40 Mean (std) 0.262 (1.162) -0.075 (1.100) 0.0054 (Wilcoxon rank sum test)
PC18 Mean (std) -0.477 (1.884) 0.137 (1.868) 0.0086 (Wilcoxon rank sum test)
PC29 Mean (std) 0.295 (1.139) -0.085 (1.429) 0.0088 (Wilcoxon rank sum test)
PC5 Mean (std) 1.075 (3.897) -0.308 (4.299) 0.0109 (Wilcoxon rank sum test)
PC10 Mean (std) -0.590 (2.646) 0.169 (2.868) 0.0145 (Wilcoxon rank sum test)
PC14 Mean (std) -0.472 (2.616) 0.135 (2.059) 0.0201 (Wilcoxon rank sum test)
PC17 Mean (std) -0.564 (2.484) 0.162 (1.819) 0.0319 (Wilcoxon rank sum test)
PC7 Mean (std) 0.461 (3.157) -0.132 (3.687) 0.0393 (Wilcoxon rank sum test)
PC1 Mean (std) -2.322 (9.581) 0.666 (10.927) 0.0504 (Wilcoxon rank sum test)
PC3 Mean (std) 0.766 (5.025) -0.220 (5.805) 0.0545 (Wilcoxon rank sum test)
PC23 Mean (std) 0.173 (1.579) -0.050 (1.628) 0.1374 (Wilcoxon rank sum test)
PC26 Mean (std) -0.117 (1.531) 0.034 (1.470) 0.1489 (Wilcoxon rank sum test)
PC38 Mean (std) -0.120 (1.109) 0.034 (1.175) 0.1511 (Wilcoxon rank sum test)
PC21 Mean (std) 0.191 (1.893) -0.055 (1.660) 0.1711 (Wilcoxon rank sum test)
PC35 Mean (std) 0.021 (1.208) -0.006 (1.223) 0.2026 (Wilcoxon rank sum test)
PC34 Mean (std) 0.152 (1.175) -0.044 (1.276) 0.2444 (Wilcoxon rank sum test)
PC32 Mean (std) 0.125 (1.303) -0.036 (1.323) 0.2485 (Wilcoxon rank sum test)
PC25 Mean (std) 0.102 (1.474) -0.029 (1.558) 0.2537 (Wilcoxon rank sum test)
PC37 Mean (std) 0.147 (1.129) -0.042 (1.208) 0.2570 (Wilcoxon rank sum test)
PC36 Mean (std) 0.214 (1.325) -0.061 (1.155) 0.2701 (Wilcoxon rank sum test)
PC19 Mean (std) 0.200 (1.634) -0.057 (1.837) 0.2998 (Wilcoxon rank sum test)
PC28 Mean (std) 0.099 (1.482) -0.029 (1.419) 0.3969 (Wilcoxon rank sum test)
PC15 Mean (std) -0.012 (2.197) 0.003 (2.033) 0.4168 (Wilcoxon rank sum test)
PC13 Mean (std) 0.129 (2.529) -0.037 (2.209) 0.4680 (Wilcoxon rank sum test)
PC31 Mean (std) 0.095 (1.402) -0.027 (1.318) 0.5624 (Wilcoxon rank sum test)
PC39 Mean (std) 0.050 (1.018) -0.014 (1.167) 0.5771 (Wilcoxon rank sum test)
PC16 Mean (std) 0.077 (2.594) -0.022 (1.875) 0.6285 (Wilcoxon rank sum test)
PC6 Mean (std) 0.064 (4.045) -0.018 (3.879) 0.6910 (Wilcoxon rank sum test)
PC27 Mean (std) 0.032 (1.371) -0.009 (1.475) 0.7470 (Wilcoxon rank sum test)
PC33 Mean (std) 0.002 (1.250) -0.001 (1.316) 0.7817 (Wilcoxon rank sum test)
PC22 Mean (std) -0.082 (1.740) 0.023 (1.654) 0.9276 (Wilcoxon rank sum test)
PC24 Mean (std) 0.030 (1.642) -0.009 (1.557) 0.9773 (Wilcoxon rank sum test)
table18 %>% save_kable('tables/table18.pdf')

Though fewer, we can see also some principal components related to Manufacturer model. I will also exclude these to avoid additional bias. I will not correct alfa as I prefer to be sure to exclude any possibly related feature.

out_manuf_vars <- manuf_rad_vars %>%
  separate(pval,into = c('pval'),sep = "<",convert = TRUE) %>% 
  filter(pval < 0.05) 

pca_stable_radiomics <- as.data.frame(pca_rad) %>% select(.,-c(out_manuf_vars$label))

Construction of final datasets

I construct both filtered and pca datasets to be able to compare results. I exclude from the datasets the considered ‘target variables’ including mainly Histology and both components of survival estimates (Survival.time and deadstatus.event). In addition I exclude Study date as clustering models don’t accept this kind of data. Regarding stage I leave only Overall.Stage variable as main representative variable for stage, and to avoid redundancy with clinical T and N stages. As we previously cleared variables related to scanner manufacturer, I will leave it out of the model as well, to avoid potential bias in clustering given its accidental relationship with mean survival.

lung_data_filtered_radiomics <- cbind(lung_data_no_out %>%
                                        dplyr::select(.,-c(starts_with('original'),
                                                    starts_with('wavelet'),
                                                    starts_with('log'),
                                                    Histology,
                                                    deadstatus.event,
                                                    Survival.time,
                                                    Study.Date,
                                                    clinical.T.Stage,
                                                    Clinical.N.Stage,
                                                    Manufacturer)),
                                      log_filtered_radiomics)
(table19 <- describe(lung_data_filtered_radiomics) %>% 
    select(described_variables,mean,sd,skewness,kurtosis,p00,p100,IQR) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
described_variables mean sd skewness kurtosis p00 p100 IQR
age 68.096665 9.9921802 -0.3103496 -0.2241460 33.684900 91.704300 13.9987000
original_shape_VoxelVolume 10.709885 1.1281660 0.0301421 -0.9811701 8.704171 13.130661 1.9256433
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis 16.596304 2.9790162 -0.5269323 -0.2889306 9.271478 23.949001 4.1594001
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis 12.304559 1.9442294 0.2305367 -0.1408371 8.677112 19.321456 2.6000494
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 8.811209 0.4740488 4.1172886 19.7973688 8.613065 12.367778 0.1353649
wavelet.HHL_glcm_ClusterProminence 8.626584 0.0237955 7.2107196 80.2732600 8.613267 8.936526 0.0130172
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis 9.527623 1.0091983 1.6867352 3.0076951 8.615145 14.346599 1.1783351
wavelet.HHH_firstorder_Kurtosis 8.614346 0.0011190 8.6099300 95.7516675 8.613641 8.629015 0.0004749
log.sigma.5.0.mm.3D_glcm_ClusterProminence 10.247035 0.8847471 0.8261584 0.7811765 8.663811 13.865661 1.2138926
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis 16.961004 2.5515652 -0.2819195 -0.2270397 10.246287 23.350476 3.2895130
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity 8.635605 0.0279432 6.9735518 79.9788610 8.613331 9.000290 0.0209150
wavelet.LLH_glcm_ClusterProminence 8.703342 0.1107818 4.4205321 34.8943190 8.613995 9.870964 0.0840138
wavelet.HLL_glcm_Autocorrelation 8.743171 0.1335679 4.2342995 25.5928234 8.629667 9.949299 0.0840438
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis 8.613049 0.0000005 6.2617483 50.2484643 8.613049 8.613054 0.0000002
wavelet.HHH_glszm_GrayLevelNonUniformity 8.614187 0.0019466 5.4812883 37.4349812 8.613230 8.632686 0.0008071
original_firstorder_Kurtosis 8.615387 0.0038026 8.5319546 108.0609777 8.613328 8.669065 0.0021182
wavelet.HHH_firstorder_Median 8.613048 0.0000059 -1.9307174 15.5618017 8.613012 8.613078 0.0000022
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 8.848511 0.4684844 3.6449288 15.8855435 8.613058 11.847375 0.2348396
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis 8.613951 0.0013458 10.2659122 154.3303544 8.613080 8.634836 0.0010045
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis 8.613049 0.0000003 2.3362011 6.1842684 8.613049 8.613050 0.0000002
wavelet.HHH_glrlm_GrayLevelVariance 8.613094 0.0000004 4.8123895 29.9377048 8.613094 8.613098 0.0000001
original_gldm_LargeDependenceLowGrayLevelEmphasis 8.613058 0.0000119 5.6403311 39.7674793 8.613050 8.613170 0.0000053
wavelet.HLH_glcm_ClusterShade 8.613052 0.0000185 -1.1593963 46.6697302 8.612862 8.613192 0.0000070
wavelet.HHH_glcm_Imc2 8.613070 0.0000057 0.3163521 -0.2444904 8.613058 8.613088 0.0000079
original_shape_Elongation 8.613181 0.0000294 -0.9660396 1.2928413 8.613060 8.613228 0.0000376
wavelet.LHH_glszm_SizeZoneNonUniformity 8.629478 0.0204540 3.5766078 19.6131261 8.613230 8.802562 0.0167069
log.sigma.2.0.mm.3D_glcm_ClusterShade 8.629758 0.4097874 -17.9399364 339.7000643 0.896731 9.220344 0.0869755
wavelet.HHH_firstorder_Skewness 8.613045 0.0000131 0.4462447 6.8311157 8.612996 8.613127 0.0000113
log.sigma.2.0.mm.3D_firstorder_Kurtosis 8.614327 0.0014558 8.3096981 100.4103061 8.613392 8.634326 0.0008110
wavelet.LLH_glszm_SmallAreaEmphasis 8.613128 0.0000071 -0.1805254 1.2763303 8.613101 8.613155 0.0000090
wavelet.LLH_glcm_MCC 8.613169 0.0000065 -0.2404626 0.3616940 8.613147 8.613190 0.0000087
wavelet.HHL_firstorder_Median 8.613045 0.0000645 0.9902558 7.8275222 8.612786 8.613400 0.0000343
log.sigma.3.0.mm.3D_glcm_Autocorrelation 8.667372 0.0432199 6.4362998 60.2068297 8.626953 9.157573 0.0280360
wavelet.HLH_glszm_SmallAreaEmphasis 8.613136 0.0000117 -0.2021992 1.5944795 8.613088 8.613177 0.0000127
wavelet.HHL_gldm_DependenceNonUniformityNormalized 8.613062 0.0000020 1.4813565 2.3754235 8.613059 8.613071 0.0000022
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis 8.613144 0.0000292 -0.1537463 -0.2057269 8.613060 8.613211 0.0000365
table19 %>% save_kable('tables/table19.pdf')

As we can see value ranges differ significantly between numerical variables (radiomics and age included) this could bias the weight of each variable when contributing to the model so it is recomended to have all variables in the same scale. I chose to use min-max scaling. Then categorical variables will be directly treated by VarCellCM.

filtered_cont_scaled <- lung_data_filtered_radiomics %>% keep(is.numeric) %>% 
  lapply(function(x){(x - min(x)) / (max(x) - min(x))}) %>% as.data.frame()

lung_data_filtered_scaled_radiomics <- lung_data_filtered_radiomics %>% 
  keep(is.factor) %>%
  cbind(filtered_cont_scaled)

(table20 <- describe(lung_data_filtered_scaled_radiomics) %>% 
    select(described_variables,mean,sd,skewness,kurtosis,p00,p100,IQR) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
described_variables mean sd skewness kurtosis p00 p100 IQR
age 0.5931079 0.1722214 -0.3103496 -0.2241460 0 1 0.2412762
original_shape_VoxelVolume 0.4531163 0.2548669 0.0301421 -0.9811701 0 1 0.4350271
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis 0.4990506 0.2029645 -0.5269323 -0.2889306 0 1 0.2833857
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis 0.3407864 0.1826538 0.2305367 -0.1408371 0 1 0.2442658
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 0.0527722 0.1262543 4.1172886 19.7973688 0 1 0.0360520
wavelet.HHL_glcm_ClusterProminence 0.0411940 0.0736113 7.2107196 80.2732600 0 1 0.0402688
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis 0.1592054 0.1760807 1.6867352 3.0076951 0 1 0.2055910
wavelet.HHH_firstorder_Kurtosis 0.0459154 0.0727851 8.6099300 95.7516675 0 1 0.0308877
log.sigma.5.0.mm.3D_glcm_ClusterProminence 0.3043579 0.1700832 0.8261584 0.7811765 0 1 0.2333579
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis 0.5124099 0.1947137 -0.2819195 -0.2270397 0 1 0.2510276
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity 0.0575621 0.0722124 6.9735518 79.9788610 0 1 0.0540496
wavelet.LLH_glcm_ClusterProminence 0.0710812 0.0881341 4.4205321 34.8943190 0 1 0.0668384
wavelet.HLL_glcm_Autocorrelation 0.0860120 0.1012161 4.2342995 25.5928234 0 1 0.0636873
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis 0.0508890 0.0877819 6.2617483 50.2484643 0 1 0.0291277
wavelet.HHH_glszm_GrayLevelNonUniformity 0.0491774 0.1000537 5.4812883 37.4349812 0 1 0.0414829
original_firstorder_Kurtosis 0.0369325 0.0682237 8.5319546 108.0609777 0 1 0.0380046
wavelet.HHH_firstorder_Median 0.5532604 0.0888468 -1.9307174 15.5618017 0 1 0.0331304
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis 0.0727983 0.1448480 3.6449288 15.8855435 0 1 0.0726087
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis 0.0400464 0.0618577 10.2659122 154.3303544 0 1 0.0461711
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis 0.1630309 0.1725650 2.3362011 6.1842684 0 1 0.1354702
wavelet.HHH_glrlm_GrayLevelVariance 0.0673668 0.1069037 4.8123895 29.9377048 0 1 0.0376577
original_gldm_LargeDependenceLowGrayLevelEmphasis 0.0662456 0.0992506 5.6403311 39.7674793 0 1 0.0444376
wavelet.HLH_glcm_ClusterShade 0.5755339 0.0561711 -1.1593963 46.6697302 0 1 0.0212667
wavelet.HHH_glcm_Imc2 0.3923415 0.1849682 0.3163521 -0.2444904 0 1 0.2581465
original_shape_Elongation 0.7184894 0.1749490 -0.9660396 1.2928413 0 1 0.2239157
wavelet.LHH_glszm_SizeZoneNonUniformity 0.0858169 0.1080327 3.5766078 19.6131261 0 1 0.0882413
log.sigma.2.0.mm.3D_glcm_ClusterShade 0.9290470 0.0492319 -17.9399364 339.7000643 0 1 0.0104493
wavelet.HHH_firstorder_Skewness 0.3736119 0.1005922 0.4462447 6.8311157 0 1 0.0867645
log.sigma.2.0.mm.3D_firstorder_Kurtosis 0.0446155 0.0695423 8.3096981 100.4103061 0 1 0.0387409
wavelet.LLH_glszm_SmallAreaEmphasis 0.5086778 0.1317556 -0.1805254 1.2763303 0 1 0.1670028
wavelet.LLH_glcm_MCC 0.5198564 0.1538946 -0.2404626 0.3616940 0 1 0.2060200
wavelet.HHL_firstorder_Median 0.4216296 0.1050188 0.9902558 7.8275222 0 1 0.0558008
log.sigma.3.0.mm.3D_glcm_Autocorrelation 0.0761722 0.0814517 6.4362998 60.2068297 0 1 0.0528364
wavelet.HLH_glszm_SmallAreaEmphasis 0.5395254 0.1313199 -0.2021992 1.5944795 0 1 0.1420057
wavelet.HHL_gldm_DependenceNonUniformityNormalized 0.2347545 0.1653278 1.4813565 2.3754235 0 1 0.1822464
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis 0.5536895 0.1942355 -0.1537463 -0.2057269 0 1 0.2425123
table20 %>% save_kable('tables/table20.pdf')

Now I construct the same dataset but including previously selected PCA radiomics.

lung_data_pca_radiomics <- cbind(lung_data_no_out %>%
                                        dplyr::select(.,-c(starts_with('original'),
                                                    starts_with('wavelet'),
                                                    starts_with('log'),
                                                    Histology,
                                                    deadstatus.event,
                                                    Survival.time,
                                                    Study.Date,
                                                    clinical.T.Stage,
                                                    Clinical.N.Stage,
                                                    Manufacturer)),
                                      pca_rad)
(table21 <- describe(lung_data_pca_radiomics) %>% 
    select(described_variables,mean,sd,skewness,kurtosis,p00,p100,IQR) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
described_variables mean sd skewness kurtosis p00 p100 IQR
age 68.0966654 9.992180 -0.3103496 -0.224146 33.684900 91.704300 13.998700
PC1 -0.0000002 10.702625 0.5550550 -0.040557 -26.332791 29.984646 14.119994
PC2 0.0000001 9.339389 0.4478609 1.088354 -29.428867 34.136508 10.861605
PC3 0.0000000 5.649295 1.0734318 2.313619 -10.772835 30.173664 6.890912
PC4 0.0000000 5.002116 -0.3868824 1.130599 -21.940297 12.065160 6.047057
PC5 0.0000002 4.246926 -0.0964084 3.964733 -16.287446 23.714593 4.616675
PC6 0.0000002 3.911156 1.5892997 6.612625 -9.895964 25.350092 4.016538
PC7 0.0000001 3.580090 0.6781949 2.570119 -10.924504 17.053005 4.310682
PC8 -0.0000002 3.275129 -0.3335332 2.811490 -15.542677 12.771833 3.690019
PC9 -0.0000002 3.084166 0.3464458 4.827625 -12.715937 18.017471 3.435268
PC10 -0.0000003 2.834371 0.3176528 1.712059 -8.597879 13.564252 3.521782
PC11 0.0000002 2.485266 -0.0419981 2.680690 -10.680815 11.020459 2.753917
PC12 0.0000000 2.379883 -0.2204075 2.394319 -11.810489 8.702184 2.670202
PC13 -0.0000001 2.281633 -0.0476898 1.386233 -7.727466 7.309715 2.671543
PC14 0.0000002 2.205923 -0.0373089 2.197230 -10.653797 9.108113 2.781478
PC15 0.0000003 2.067518 0.6804244 3.165130 -8.530480 11.042273 2.325751
PC16 0.0000002 2.053691 -0.1939465 6.335232 -11.237367 11.933086 2.284062
PC17 -0.0000001 2.005559 -0.9101037 5.704436 -13.387733 5.407721 2.142603
PC18 -0.0000003 1.886330 -0.6791186 2.106123 -8.511111 4.542458 2.034833
PC19 -0.0000001 1.795158 -0.1509900 8.126797 -12.298470 8.005069 1.729620
PC20 0.0000001 1.779808 -0.3851491 3.990909 -10.106732 7.943143 1.938966
PC21 0.0000002 1.715134 0.1503979 6.051416 -7.658191 8.803511 1.492654
PC22 -0.0000001 1.671909 0.1290350 4.698228 -9.567231 8.325323 1.887591
PC23 -0.0000001 1.617828 0.4883454 4.219930 -6.058912 9.210277 1.792301
PC24 -0.0000002 1.574196 0.5459982 3.389876 -5.230692 8.446571 1.635901
PC25 0.0000001 1.538824 -0.0328522 1.131646 -5.095086 5.547838 1.836553
PC26 0.0000005 1.483264 -0.4738358 1.619419 -6.676081 4.960074 1.569154
PC27 -0.0000004 1.450769 0.0653712 5.065931 -7.871934 6.820803 1.433714
PC28 0.0000003 1.432399 0.1385086 1.132057 -4.264915 5.865184 1.628827
PC29 -0.0000002 1.377224 -0.2302993 4.160400 -6.448974 6.057279 1.235087
PC30 -0.0000003 1.359600 -0.0424923 1.450056 -5.491316 6.025697 1.584423
PC31 0.0000000 1.336330 0.4437062 4.415112 -4.445805 7.654492 1.472052
PC32 0.0000003 1.318652 -0.1417663 4.970859 -7.878787 5.835954 1.314457
PC33 -0.0000001 1.299939 0.1707649 3.306426 -4.847268 7.051430 1.448928
PC34 -0.0000003 1.255029 -0.5616045 3.341355 -7.256415 4.243991 1.415922
PC35 0.0000002 1.217802 0.1557651 2.213162 -4.718653 5.783280 1.509045
PC36 0.0000002 1.198445 -0.0986266 2.228830 -4.805565 3.998673 1.330535
PC37 -0.0000002 1.192104 -0.1391817 2.077760 -4.367936 5.017335 1.364204
PC38 0.0000001 1.161071 -0.1426405 1.750408 -5.267446 3.440354 1.306416
PC39 0.0000004 1.134753 0.7418311 4.964879 -3.970764 6.506781 1.166654
PC40 -0.0000003 1.121473 0.1960900 2.281263 -4.981194 4.243599 1.333379
table21 %>% save_kable('tables/table21.pdf')

I will repeat the min-max scaling with PCA dataset to be sure all continuous variables have the same range of values.

pca_cont_scaled <- lung_data_pca_radiomics %>% keep(is.numeric) %>% 
  lapply(function(x){(x - min(x)) / (max(x) - min(x))}) %>% as.data.frame()

lung_data_pca_scaled_radiomics <- lung_data_pca_radiomics %>% 
  keep(is.factor) %>%
  cbind(pca_cont_scaled)

(table22 <- describe(lung_data_pca_scaled_radiomics) %>% 
    select(described_variables,mean,sd,skewness,kurtosis,p00,p100,IQR) %>% kable(full_with = FALSE) %>% kable_classic_2()) 
described_variables mean sd skewness kurtosis p00 p100 IQR
age 0.5931079 0.1722214 -0.3103496 -0.224146 0 1 0.2412762
PC1 0.4675779 0.1900411 0.5550550 -0.040557 0 1 0.2507215
PC2 0.4629701 0.1469257 0.4478609 1.088354 0 1 0.1708730
PC3 0.2630954 0.1379677 1.0734318 2.313619 0 1 0.1682906
PC4 0.6451993 0.1470974 -0.3868824 1.130599 0 1 0.1778261
PC5 0.4071654 0.1061677 -0.0964084 3.964733 0 1 0.1154110
PC6 0.2807680 0.1109672 1.5892997 6.612625 0 1 0.1139571
PC7 0.3904745 0.1279631 0.6781949 2.570119 0 1 0.1540767
PC8 0.5489297 0.1156696 -0.3335332 2.811490 0 1 0.1303225
PC9 0.4137496 0.1003522 0.3464458 4.827625 0 1 0.1117763
PC10 0.3879536 0.1278925 0.3176528 1.712059 0 1 0.1589099
PC11 0.4921746 0.1145217 -0.0419981 2.680690 0 1 0.1269012
PC12 0.5757655 0.1160201 -0.2204075 2.394319 0 1 0.1301733
PC13 0.5138906 0.1517328 -0.0476898 1.386233 0 1 0.1776625
PC14 0.5391077 0.1116250 -0.0373089 2.197230 0 1 0.1407494
PC15 0.4358345 0.1056324 0.6804244 3.165130 0 1 0.1188259
PC16 0.4849869 0.0886340 -0.1939465 6.335232 0 1 0.0985765
PC17 0.7122857 0.1067045 -0.9101037 5.704436 0 1 0.1139958
PC18 0.6520141 0.1445068 -0.6791186 2.106123 0 1 0.1558833
PC19 0.6057303 0.0884160 -0.1509900 8.126797 0 1 0.0851881
PC20 0.5599336 0.0986050 -0.3851491 3.990909 0 1 0.1074227
PC21 0.4652126 0.1041893 0.1503979 6.051416 0 1 0.0906743
PC22 0.5347046 0.0934416 0.1290350 4.698228 0 1 0.1054959
PC23 0.3968064 0.1059538 0.4883454 4.219930 0 1 0.1173803
PC24 0.3824370 0.1150958 0.5459982 3.389876 0 1 0.1196073
PC25 0.4787300 0.1445865 -0.0328522 1.131646 0 1 0.1725610
PC26 0.5737361 0.1274703 -0.4738358 1.619419 0 1 0.1348516
PC27 0.5357704 0.0987405 0.0653712 5.065931 0 1 0.0975797
PC28 0.4210142 0.1414003 0.1385086 1.132057 0 1 0.1607909
PC29 0.5156599 0.1101228 -0.2302993 4.160400 0 1 0.0987576
PC30 0.4768003 0.1180514 -0.0424923 1.450056 0 1 0.1375724
PC31 0.3674129 0.1104378 0.4437062 4.415112 0 1 0.1216542
PC32 0.5744758 0.0961485 -0.1417663 4.970859 0 1 0.0958426
PC33 0.4073780 0.1092505 0.1707649 3.306426 0 1 0.1217720
PC34 0.6309703 0.1091291 -0.5616045 3.341355 0 1 0.1231193
PC35 0.4493128 0.1159598 0.1557651 2.213162 0 1 0.1436922
PC36 0.5458241 0.1361214 -0.0986266 2.228830 0 1 0.1511244
PC37 0.4654033 0.1270186 -0.1391817 2.077760 0 1 0.1453559
PC38 0.6049112 0.1333369 -0.1426405 1.750408 0 1 0.1500282
PC39 0.3789785 0.1083034 0.7418311 4.964879 0 1 0.1113481
PC40 0.5399789 0.1215716 0.1960900 2.281263 0 1 0.1445430
table22 %>% save_kable('tables/table22.pdf')

Clustering

Clustering mixed data with VarCellCM

With varsel variable selection

Data MRMR filtered Radiomics

First I will generate a model based clustering model using the MRMR filtered radiomics dataset with the added clinical variables and scaling done in previous points. I generate the model using VarSelCluster function from VarSelCM package, using variable selection. I display main discriminative variables given the model adjustment and clusters in 3D scatter plot using these variables as main axes for the representation. I also display the uncertainty probability for each cluster.

# I adjust VarSelCluster model without variables selection for this first
# approach and using BIC criteria to estimate the optimal number of clusters
set.seed(12345)
res_varsel <- VarSelCluster(x = lung_data_filtered_scaled_radiomics,
                          gvals = 2:10, 
                          nbcores = 4, 
                          vbleSelec = TRUE,  
                          crit.varsel = "BIC")

# I check the resulting model summary
summary(res_varsel)
## Model:
##    Number of components: 10 
##    Model selection has been performed according to the BIC  criterion 
##    Variable selection has been performed, 31  ( 81.58 % ) of the variables are relevant for clustering 
## 
res_varsel@model@names.relevant
##  [1] "original_shape_VoxelVolume"                             
##  [2] "wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis"       
##  [3] "wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis"        
##  [4] "log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis"
##  [5] "wavelet.HHL_glcm_ClusterProminence"                     
##  [6] "wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis"        
##  [7] "wavelet.HHH_firstorder_Kurtosis"                        
##  [8] "log.sigma.5.0.mm.3D_glcm_ClusterProminence"             
##  [9] "wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis"        
## [10] "log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity"        
## [11] "wavelet.LLH_glcm_ClusterProminence"                     
## [12] "wavelet.HLL_glcm_Autocorrelation"                       
## [13] "log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis"
## [14] "wavelet.HHH_glszm_GrayLevelNonUniformity"               
## [15] "original_firstorder_Kurtosis"                           
## [16] "wavelet.HHH_firstorder_Median"                          
## [17] "log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis"
## [18] "wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis"  
## [19] "wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis"   
## [20] "wavelet.HHH_glrlm_GrayLevelVariance"                    
## [21] "original_gldm_LargeDependenceLowGrayLevelEmphasis"      
## [22] "wavelet.HLH_glcm_ClusterShade"                          
## [23] "wavelet.LHH_glszm_SizeZoneNonUniformity"                
## [24] "log.sigma.2.0.mm.3D_glcm_ClusterShade"                  
## [25] "log.sigma.2.0.mm.3D_firstorder_Kurtosis"                
## [26] "wavelet.LLH_glszm_SmallAreaEmphasis"                    
## [27] "wavelet.HHL_firstorder_Median"                          
## [28] "log.sigma.3.0.mm.3D_glcm_Autocorrelation"               
## [29] "wavelet.HLH_glszm_SmallAreaEmphasis"                    
## [30] "wavelet.HHL_gldm_DependenceNonUniformityNormalized"     
## [31] "wavelet.HHH_glszm_LowGrayLevelZoneEmphasis"
res_varsel@model@names.irrelevant
## [1] "age"                             "wavelet.HHH_glcm_Imc2"          
## [3] "original_shape_Elongation"       "wavelet.HHH_firstorder_Skewness"
## [5] "wavelet.LLH_glcm_MCC"            "Overall.Stage"                  
## [7] "gender"
# I check the discriminative power of the variables 
tiff(filename = "figures/Fig22.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel)
fig <- dev.off()
plot(res_varsel)

# I build q data frame including partitions, variables used in the model and other variables of interest
lung_data_res_varsel <- lung_data_filtered_scaled_radiomics %>% 
                              cbind(lung_data_no_out %>% select(Histology,
                                                                Survival.time,
                                                                deadstatus.event,
                                                                Manufacturer),
                                    partition = as.factor(res_varsel@partitions@zMAP))


# I generate a representation of the clusters using the main discriminative variables to define the axes
# (screencapture saved as figure 23)
plot_ly(lung_data_res_varsel, x=~original_shape_VoxelVolume, y=~wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis,
        z=~wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis, color=~partition) %>%
        add_markers(size=1.5)%>%
        layout(
          scene = list(
            xaxis = list(size = 0.5),
            yaxis = list(size = 0.5),
            zaxis = list(size = 0.5)))
# I display a summary of the probabilities of missclassification for each cluster
tiff(filename = "figures/Fig24.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel, type="probs-class")
fig <- dev.off()
plot(res_varsel, type="probs-class")

Then I want to evaluate possible relationship between clusters generated and histology class.

# I generate a cross table to check distribution of histology classes with partitions
table_hist_clust_res_varsel <- table(lung_data_res_varsel$Histology,lung_data_res_varsel$partition)
addmargins(table_hist_clust_res_varsel)
##                          
##                             1   2   3   4   5   6   7   8   9  10 Sum
##   adenocarcinoma            3   7   7   5   4   2   7   7   4   5  51
##   large cell                8  16   5  13  13   5  14  20   8  12 114
##   nos                       4   6   7   4   6   3   8  10   8   5  61
##   squamous cell carcinoma  17  24  11  12  22   9  15  23  10   8 151
##   Sum                      32  53  30  34  45  19  44  60  30  30 377
# I evaluate general association between clusters and histology class using chi 2 test
chisq.test(table_hist_clust_res_varsel) 
## 
##  Pearson's Chi-squared test
## 
## data:  table_hist_clust_res_varsel
## X-squared = 18.277, df = 27, p-value = 0.8948
# I display distribution of histology class within different clusters
ggplot(lung_data_res_varsel,aes(partition, fill = Histology)) + 
  geom_bar(position = 'fill',alpha = .5) +
  coord_flip()+
  theme_bw() 

ggsave("figures/Fig25.tiff",width = 100, height = 50, device="tiff", dpi=300, units = "mm", scale = 1)


# I continue with a correspondence analysis and generate an assymetric representation using 
# columns (clusters) repsented with the principal coordinates and histology classes
# represented with standard coordinates
ca.res_varsel <- ca(table_hist_clust_res_varsel)
tiff(filename = "figures/Fig26.tiff", width = 200, height = 200, units = "mm", res = 300)
plot(ca.res_varsel, map= "colprincipal") 
fig <- dev.off()
plot(ca.res_varsel, map= "colprincipal") 

# From CA analisis I get number of clusters with the greatest inertias to further compare cluster characteristics
clust_top_inertias <- data.frame(clust = ca.res_varsel$colnames, inertia = get_ca_col(ca.res_varsel)$inertia) %>% arrange(desc(inertia)) %>% top_n(3)

# After evaluating main clusters associated with different histology classes we can focus our analysis in 
# describing differences between these clusters:
lung_data_res_varsel_selected <- lung_data_res_varsel %>% filter(partition %in% clust_top_inertias$clust) %>% droplevels()
lung_data_res_varsel_selected_table <- table(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition)


# I evaluate agreement using adjusted rand index (ARI) coefficient
cat('ARI (Histology class, Partitions):',ARI(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition))
## ARI (Histology class, Partitions): -0.003684613
# I repeat a chi2 test just with this clusters
chisq.test(lung_data_res_varsel_selected_table) 
## 
##  Pearson's Chi-squared test
## 
## data:  lung_data_res_varsel_selected_table
## X-squared = 5.0178, df = 6, p-value = 0.5415
# And I generate a cross table focusing on these clusters
(crosstab1<- crosstable(lung_data_res_varsel_selected %>% select(.,-c(deadstatus.event,Survival.time)), 
           by = partition, test = TRUE)) 
## # A tibble: 156 × 7
##    .id           label         variable   `3`           `9`          `10`  test 
##    <chr>         <chr>         <chr>      <chr>         <chr>        <chr> <chr>
##  1 Overall.Stage Overall.Stage I          1 (7.69%)     4 (30.77%)   8 (6… "p v…
##  2 Overall.Stage Overall.Stage II         3 (37.50%)    3 (37.50%)   2 (2… "p v…
##  3 Overall.Stage Overall.Stage IIIa       9 (37.50%)    12 (50.00%)  3 (1… "p v…
##  4 Overall.Stage Overall.Stage IIIb       17 (37.78%)   11 (24.44%)  17 (… "p v…
##  5 gender        gender        female     10 (30.30%)   15 (45.45%)  8 (2… "p v…
##  6 gender        gender        male       20 (35.09%)   15 (26.32%)  22 (… "p v…
##  7 age           age           Min / Max  0.2 / 0.9     0.2 / 0.8    0.3 … "p v…
##  8 age           age           Med [IQR]  0.5 [0.4;0.7] 0.6 [0.5;0.… 0.7 … "p v…
##  9 age           age           Mean (std) 0.5 (0.2)     0.6 (0.2)    0.6 … "p v…
## 10 age           age           N (NA)     30 (0)        30 (0)       30 (… "p v…
## # … with 146 more rows
 (table23 <- crosstab1 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 3 9 10 pval test
original_shape_VoxelVolume Mean (std) 0.8 (0.2) 0.2 (0.1) 0.8 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 0.7 (0.1) 0.3 (0.1) 0.7 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.7 (0.1) 0.2 (0.1) 0.5 (0.1) <0.0001 (One-way analysis of means (not assuming equal variances))
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.4 (0.3) 7e-04 (6e-04) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_glcm_ClusterProminence Mean (std) 0.002 (0.002) 0.1 (0.1) 0.02 (0.01) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.6 (0.2) 0.03 (0.05) 0.2 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 0.1 (0.1) 0.4 (0.2) 0.2 (0.1) <0.0001 (One-way analysis of means (not assuming equal variances))
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.8 (0.1) 0.3 (0.1) 0.6 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 0.1 (0.1) 0.02 (0.02) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 0.009 (0.006) 0.1 (0.1) 0.02 (0.009) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 0.1 (0.04) 0.05 (0.04) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 0.04 (0.02) 0.01 (0.01) 0.1 (0.05) <0.0001 (Kruskal-Wallis rank sum test)
original_firstorder_Kurtosis Mean (std) 0.1 (0.2) 0.007 (0.006) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.4 (0.3) 0.002 (0.001) 0.2 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 0.006 (0.004) 0.04 (0.04) 0.04 (0.02) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.3 (0.2) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 0.02 (2e-04) 0.1 (0.1) 0.04 (0.01) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 0.1 (0.1) 0.03 (0.02) 0.2 (0.2) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 0.1 (0.2) 0.02 (0.01) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 0.4 (0.2) 0.3 (0.2) 0.3 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 0.1 (0.04) 0.1 (0.2) 0.05 (0.03) 0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 0.3 (0.2) 0.4 (0.1) 0.5 (0.2) 0.0002 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 0.6 (0.2) 0.6 (0.2) 0.5 (0.2) 0.0007 (One-way analysis of means)
wavelet.HHH_firstorder_Kurtosis Mean (std) 0.1 (0.04) 0.02 (0.02) 0.1 (0.1) 0.0014 (Kruskal-Wallis rank sum test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 0.9 (0.002) 0.9 (0.2) 0.9 (0.004) 0.0025 (Kruskal-Wallis rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 0.1 (0.03) 0.1 (0.1) 0.1 (0.05) 0.0121 (Kruskal-Wallis rank sum test)
Manufacturer CMS Inc.  3 (15.00%) 5 (25.00%) 12 (60.00%) 0.0135 (Pearson’s Chi-squared test)
Manufacturer SIEMENS 27 (38.57%) 25 (35.71%) 18 (25.71%) 0.0135 (Pearson’s Chi-squared test)
Overall.Stage I 1 (7.69%) 4 (30.77%) 8 (61.54%) 0.0318 (Fisher’s Exact Test for Count Data)
Overall.Stage II 3 (37.50%) 3 (37.50%) 2 (25.00%) 0.0318 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIa 9 (37.50%) 12 (50.00%) 3 (12.50%) 0.0318 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIb 17 (37.78%) 11 (24.44%) 17 (37.78%) 0.0318 (Fisher’s Exact Test for Count Data)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.1 (0.1) 0.03 (0.01) 0.0652 (Kruskal-Wallis rank sum test)
age Mean (std) 0.5 (0.2) 0.6 (0.2) 0.6 (0.2) 0.0937 (One-way analysis of means)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 0.5 (0.1) 0.5 (0.2) 0.5 (0.1) 0.1068 (One-way analysis of means)
wavelet.HHL_firstorder_Median Mean (std) 0.4 (0.009) 0.4 (0.1) 0.4 (0.02) 0.1092 (One-way analysis of means (not assuming equal variances))
gender female 10 (30.30%) 15 (45.45%) 8 (24.24%) 0.1547 (Pearson’s Chi-squared test)
gender male 20 (35.09%) 15 (26.32%) 22 (38.60%) 0.1547 (Pearson’s Chi-squared test)
wavelet.HLH_glcm_ClusterShade Mean (std) 0.6 (0.005) 0.6 (0.04) 0.6 (0.01) 0.2717 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 0.6 (0.2) 0.6 (0.2) 0.6 (0.1) 0.4490 (Kruskal-Wallis rank sum test)
wavelet.HHH_firstorder_Skewness Mean (std) 0.4 (0.1) 0.4 (0.2) 0.4 (0.1) 0.5208 (Kruskal-Wallis rank sum test)
Histology adenocarcinoma 7 (43.75%) 4 (25.00%) 5 (31.25%) 0.5415 (Pearson’s Chi-squared test)
Histology large cell 5 (20.00%) 8 (32.00%) 12 (48.00%) 0.5415 (Pearson’s Chi-squared test)
Histology nos 7 (35.00%) 8 (40.00%) 5 (25.00%) 0.5415 (Pearson’s Chi-squared test)
Histology squamous cell carcinoma 11 (37.93%) 10 (34.48%) 8 (27.59%) 0.5415 (Pearson’s Chi-squared test)
wavelet.LLH_glcm_MCC Mean (std) 0.6 (0.2) 0.5 (0.2) 0.5 (0.2) 0.7608 (One-way analysis of means)
wavelet.HHH_firstorder_Median Mean (std) 0.6 (0.007) 0.6 (0.1) 0.6 (0.02) 0.7894 (One-way analysis of means (not assuming equal variances))
original_shape_Elongation Mean (std) 0.7 (0.1) 0.7 (0.2) 0.7 (0.1) 0.8901 (Kruskal-Wallis rank sum test)
 table23 %>% save_kable("tables/table23.png")

Finally I want to analyze survival probabilities among different clusters.

# I generate a specific dataframe including survival, dead status event and partition variables
df_surv_res_varsel <- data.frame(Survival.time = lung_data_no_out$Survival.time, 
                                  deadstatus.event = lung_data_no_out$deadstatus.event,
                         partition = res_varsel@partitions@zMAP)

# I create a survival object, and use it as response variable 
# to create Kaplan-Meier survival curves for each cluster
sfit_res_varsel <- survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
                            data = df_surv_res_varsel)

# I plot survival curve including confidence intervals for each curve and 
# p value result from log-rank test to evaluate difference in survival between groups
tiff(filename = "figures/Fig27.tiff", width = 300, height = 200, units = "mm", res = 300)
(surv <- ggsurvplot(sfit_res_varsel, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))
fig <- dev.off()
(surv <- ggsurvplot(sfit_res_varsel, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))

min_max_surv <- surv_median(sfit_res_varsel) %>% 
  filter(median == min(median) | median == max(median)) %>% 
  select(strata) %>% 
  separate(strata,
            into = c('string', 
                    'partition_num'),
            sep = "=",
            convert = TRUE) 
min_max_surv$partition_num
## [1]  2 10
# I repeat log-rank test comparing only the two most extreme median survival groups 
surv_pvalue(
  survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
          data = df_surv_res_varsel %>% filter(partition %in% min_max_surv$partition_num)),
  method = "survdiff",
  test.for.trend = FALSE,
  combine = FALSE
)
##    variable        pval   method   pval.txt
## 1 partition 0.002720169 Log-rank p = 0.0027
# And describe composition for the clusters with most extreme results on survival analisis
crosstab2<- crosstable(lung_data_res_varsel %>% filter(partition %in% min_max_surv$partition_num) %>% select(.,-c(deadstatus.event,Survival.time)) %>% droplevels(), 
           by = partition, test = TRUE)

 (table24 <- crosstab2 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 2 10 pval test
original_shape_VoxelVolume Mean (std) 0.3 (0.1) 0.8 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 0.4 (0.1) 0.7 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.3 (0.1) 0.5 (0.1) <0.0001 (Two Sample t-test)
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.003 (0.002) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_glcm_ClusterProminence Mean (std) 0.01 (0.007) 0.02 (0.01) <0.0001 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 0.3 (0.1) 0.2 (0.1) <0.0001 (Welch Two Sample t-test)
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.5 (0.1) 0.6 (0.1) <0.0001 (Two Sample t-test)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 0.03 (0.02) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 0.1 (0.04) 0.02 (0.009) <0.0001 (Wilcoxon rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 0.04 (0.02) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 0.01 (0.009) 0.1 (0.05) <0.0001 (Wilcoxon rank sum test)
original_firstorder_Kurtosis Mean (std) 0.02 (0.02) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.007 (0.007) 0.2 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 0.01 (0.008) 0.04 (0.02) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 0.2 (0.1) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 0.02 (2e-04) 0.04 (0.01) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 0.3 (0.2) 0.5 (0.2) <0.0001 (Wilcoxon rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 0.02 (0.01) 0.2 (0.2) <0.0001 (Wilcoxon rank sum test)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 0.02 (0.01) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 0.04 (0.02) 0.1 (0.05) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 0.1 (0.1) 0.3 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 0.7 (0.1) 0.5 (0.2) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.2 (0.1) 0.0001 (Wilcoxon rank sum test)
Manufacturer CMS Inc.  5 (29.41%) 12 (70.59%) 0.0009 (Pearson’s Chi-squared test)
Manufacturer SIEMENS 48 (72.73%) 18 (27.27%) 0.0009 (Pearson’s Chi-squared test)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 0.4 (0.1) 0.5 (0.1) 0.0014 (Two Sample t-test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 0.5 (0.2) 0.6 (0.1) 0.0174 (Wilcoxon rank sum test)
Overall.Stage I 11 (57.89%) 8 (42.11%) 0.0328 (Fisher’s Exact Test for Count Data)
Overall.Stage II 4 (66.67%) 2 (33.33%) 0.0328 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIa 20 (86.96%) 3 (13.04%) 0.0328 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIb 18 (51.43%) 17 (48.57%) 0.0328 (Fisher’s Exact Test for Count Data)
wavelet.HHH_firstorder_Kurtosis Mean (std) 0.03 (0.02) 0.1 (0.1) 0.0567 (Wilcoxon rank sum test)
age Mean (std) 0.6 (0.2) 0.6 (0.2) 0.0747 (Two Sample t-test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 0.9 (0.007) 0.9 (0.004) 0.0898 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Median Mean (std) 0.6 (0.04) 0.6 (0.02) 0.1300 (Welch Two Sample t-test)
wavelet.HHL_firstorder_Median Mean (std) 0.4 (0.1) 0.4 (0.02) 0.3146 (Welch Two Sample t-test)
wavelet.HLH_glcm_ClusterShade Mean (std) 0.6 (0.01) 0.6 (0.01) 0.3384 (Wilcoxon rank sum test)
Histology adenocarcinoma 7 (58.33%) 5 (41.67%) 0.3959 (Fisher’s Exact Test for Count Data)
Histology large cell 16 (57.14%) 12 (42.86%) 0.3959 (Fisher’s Exact Test for Count Data)
Histology nos 6 (54.55%) 5 (45.45%) 0.3959 (Fisher’s Exact Test for Count Data)
Histology squamous cell carcinoma 24 (75.00%) 8 (25.00%) 0.3959 (Fisher’s Exact Test for Count Data)
gender female 18 (69.23%) 8 (30.77%) 0.4912 (Pearson’s Chi-squared test)
gender male 35 (61.40%) 22 (38.60%) 0.4912 (Pearson’s Chi-squared test)
original_shape_Elongation Mean (std) 0.7 (0.2) 0.7 (0.1) 0.5378 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Skewness Mean (std) 0.4 (0.1) 0.4 (0.1) 0.6628 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 0.04 (0.03) 0.03 (0.01) 0.6975 (Wilcoxon rank sum test)
wavelet.LLH_glcm_MCC Mean (std) 0.5 (0.1) 0.5 (0.2) 0.7107 (Welch Two Sample t-test)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 0.1 (0.03) 0.05 (0.03) 0.7401 (Wilcoxon rank sum test)
 table24 %>% save_kable("tables/table24.png")
Data PCA Radiomics
set.seed(12345)

# I adjust VarSelCluster model without variables selection for this first
# approach and using BIC criteria to estimate the number of partitions
res_varsel_pca <- VarSelCluster(x = lung_data_pca_scaled_radiomics,
                          gvals = 2:10, 
                          nbcores = 4, 
                          vbleSelec = TRUE,  
                          crit.varsel = "BIC")

# I check the resulting model summary
summary(res_varsel_pca)
## Model:
##    Number of components: 3 
##    Model selection has been performed according to the BIC  criterion 
##    Variable selection has been performed, 40  ( 93.02 % ) of the variables are relevant for clustering 
## 
res_varsel_pca@model
## An object of class "VSLCMmodel"
## Slot "g":
## [1] 3
## 
## Slot "omega":
##           age           PC1           PC2           PC3           PC4 
##             0             1             1             1             1 
##           PC5           PC6           PC7           PC8           PC9 
##             1             1             1             1             1 
##          PC10          PC11          PC12          PC13          PC14 
##             1             1             1             1             1 
##          PC15          PC16          PC17          PC18          PC19 
##             1             1             1             1             1 
##          PC20          PC21          PC22          PC23          PC24 
##             1             1             1             1             1 
##          PC25          PC26          PC27          PC28          PC29 
##             1             1             1             1             1 
##          PC30          PC31          PC32          PC33          PC34 
##             1             1             1             1             1 
##          PC35          PC36          PC37          PC38          PC39 
##             1             1             1             1             1 
##          PC40 Overall.Stage        gender 
##             1             0             0 
## 
## Slot "names.relevant":
##  [1] "PC1"  "PC2"  "PC3"  "PC4"  "PC5"  "PC6"  "PC7"  "PC8"  "PC9"  "PC10"
## [11] "PC11" "PC12" "PC13" "PC14" "PC15" "PC16" "PC17" "PC18" "PC19" "PC20"
## [21] "PC21" "PC22" "PC23" "PC24" "PC25" "PC26" "PC27" "PC28" "PC29" "PC30"
## [31] "PC31" "PC32" "PC33" "PC34" "PC35" "PC36" "PC37" "PC38" "PC39" "PC40"
## 
## Slot "names.irrelevant":
## [1] "age"           "Overall.Stage" "gender"
# I check the discriminative power of the variables 
tiff(filename = "figures/Fig28.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel_pca)
fig <- dev.off()
plot(res_varsel_pca)

# I build q data frame including partitions, variables used in the model and other variables of interest
lung_data_res_varsel_pca <- lung_data_pca_scaled_radiomics %>% 
                              cbind(lung_data_no_out %>% select(Histology,
                                                                Survival.time,
                                                                deadstatus.event,
                                                                Manufacturer),
                                    partition = as.factor(res_varsel_pca@partitions@zMAP))


# I generate a representation of the clusters using the main discriminative variables to define the axes
# (screencapture saved as figure 29)
plot_ly(lung_data_res_varsel_pca, x=~PC1, y=~PC2,
        z=~PC3, color=~partition) %>%
        add_markers(size=1.5)%>%
        layout(
          scene = list(
            xaxis = list(size = 0.5),
            yaxis = list(size = 0.5),
            zaxis = list(size = 0.5)))
# I display a summary of the probabilities of missclassification for each cluster
tiff(filename = "figures/Fig30.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel_pca, type="probs-class")
fig <- dev.off()
plot(res_varsel_pca, type="probs-class")

Then I want to evaluate possible relationship between clusters generated and histology class.

# I generate a cross table to check distribution of histology classes with partitions
table_hist_clust_res_varsel_pca <- table(lung_data_res_varsel_pca$Histology,lung_data_res_varsel_pca$partition)
addmargins(table_hist_clust_res_varsel_pca)
##                          
##                             1   2   3 Sum
##   adenocarcinoma           23   7  21  51
##   large cell               50  11  53 114
##   nos                      25   9  27  61
##   squamous cell carcinoma  49  22  80 151
##   Sum                     147  49 181 377
# I evaluate general association between clusters and histology class using chi 2 test
chisq.test(table_hist_clust_res_varsel_pca) 
## 
##  Pearson's Chi-squared test
## 
## data:  table_hist_clust_res_varsel_pca
## X-squared = 5.8419, df = 6, p-value = 0.4411
# I display distribution of histology class within different clusters
ggplot(lung_data_res_varsel_pca,aes(partition, fill = Histology)) + 
  geom_bar(position = 'fill',alpha = .5) +
  coord_flip()+
  theme_bw() 

ggsave("figures/Fig31.tiff",width = 100, height = 50, device="tiff", dpi=300, units = "mm", scale = 1)


# I continue with a correspondence analysis and generate an assymetric representation using 
# columns (clusters) repsented with the principal coordinates and histology classes
# represented with standard coordinates
ca.res_varsel_pca <- ca(table_hist_clust_res_varsel_pca)
tiff(filename = "figures/Fig32.tiff", width = 200, height = 200, units = "mm", res = 300)
plot(ca.res_varsel_pca, map= "colprincipal") 
fig <- dev.off()
plot(ca.res_varsel_pca, map= "colprincipal") 

# From CA analisis I get number of clusters with the greatest inertias to further compare cluster characteristics
clust_top_inertias <- data.frame(clust = ca.res_varsel_pca$colnames, inertia = get_ca_col(ca.res_varsel_pca)$inertia) %>% arrange(desc(inertia)) %>% top_n(3)

# After evaluating main clusters associated with different histology classes we can focus our analysis in 
# describing differences between these clusters:
lung_data_res_varsel_selected <- lung_data_res_varsel_pca %>% filter(partition %in% clust_top_inertias$clust) %>% droplevels()
lung_data_res_varsel_selected_table <- table(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition)



# I evaluate agreement using adjusted rand index (ARI) coefficient
cat('ARI (Histology class, Partitions):',ARI(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition))
## ARI (Histology class, Partitions): 0.00571581
# I repeat a chi2 test just with this clusters
chisq.test(lung_data_res_varsel_selected_table) 
## 
##  Pearson's Chi-squared test
## 
## data:  lung_data_res_varsel_selected_table
## X-squared = 5.8419, df = 6, p-value = 0.4411
# And I generate a cross table focusing on these clusters
crosstab3<- crosstable(lung_data_res_varsel_selected %>% select(.,-c(deadstatus.event,Survival.time)), 
           by = partition, test = TRUE) 
## Error in fisher.test(x, y): FEXACT error 6.  LDKEY=620 is too small for this problem,
##   (ii := key2[itp=287] = 967485, ldstp=18600)
## Try increasing the size of the workspace and possibly 'mult'
 (table25 <- crosstab3 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
## Error in separate(., test, into = c("pvalue", "test"), sep = "\n"): object 'crosstab3' not found
 table25 %>% save_kable("tables/table25.png")
## Error in save_kable(., "tables/table25.png"): object 'table25' not found

Finally I want to analyze survival probabilities among different clusters.

# I generate a specific dataframe including survival, dead status event and partition variables
df_surv_res_varsel_pca <- data.frame(Survival.time = lung_data_no_out$Survival.time, 
                                  deadstatus.event = lung_data_no_out$deadstatus.event,
                         partition = res_varsel_pca@partitions@zMAP)

# I create a survival object, and use it as response variable 
# to create Kaplan-Meier survival curves for each cluster
sfit_res_varsel_pca <- survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
                            data = df_surv_res_varsel_pca)

# I plot survival curve including confidence intervals for each curve and 
# p value result from log-rank test to evaluate difference in survival between groups
tiff(filename = "figures/Fig33.tiff", width = 300, height = 200, units = "mm", res = 300)
(surv <- ggsurvplot(sfit_res_varsel_pca, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))
fig <- dev.off()
(surv <- ggsurvplot(sfit_res_varsel_pca, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))

min_max_surv <- surv_median(sfit_res_varsel_pca) %>% 
  filter(median == min(median) | median == max(median)) %>% 
  select(strata) %>% 
  separate(strata,
            into = c('string', 
                    'partition_num'),
            sep = "=",
            convert = TRUE) 
min_max_surv$partition_num
## [1] 2 3
# I repeat log-rank test comparing only the two most extreme median survival groups 
surv_pvalue(
  survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
          data = df_surv_res_varsel_pca %>% filter(partition %in% min_max_surv$partition_num)),
  method = "survdiff",
  test.for.trend = FALSE,
  combine = FALSE
)
##    variable       pval   method  pval.txt
## 1 partition 0.01207893 Log-rank p = 0.012
# And describe composition for the clusters with most extreme results on survival analisis
crosstab4<- crosstable(lung_data_res_varsel_pca %>% filter(partition %in% min_max_surv$partition_num) %>% select(.,-c(deadstatus.event,Survival.time)) %>% droplevels(), 
           by = partition, test = TRUE)

 (table26 <- crosstab4 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 2 3 pval test
PC3 Mean (std) 0.5 (0.2) 0.2 (0.1) <0.0001 (Welch Two Sample t-test)
PC4 Mean (std) 0.5 (0.2) 0.7 (0.1) <0.0001 (Welch Two Sample t-test)
PC6 Mean (std) 0.4 (0.2) 0.2 (0.1) <0.0001 (Wilcoxon rank sum test)
PC1 Mean (std) 0.6 (0.3) 0.4 (0.1) 0.0039 (Wilcoxon rank sum test)
PC27 Mean (std) 0.6 (0.2) 0.5 (0.1) 0.0961 (Wilcoxon rank sum test)
PC22 Mean (std) 0.6 (0.2) 0.5 (0.1) 0.1370 (Wilcoxon rank sum test)
PC7 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.1503 (Welch Two Sample t-test)
PC5 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.1622 (Wilcoxon rank sum test)
PC31 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.1868 (Wilcoxon rank sum test)
PC15 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.2618 (Welch Two Sample t-test)
PC14 Mean (std) 0.5 (0.2) 0.6 (0.1) 0.2986 (Welch Two Sample t-test)
PC26 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.2986 (Wilcoxon rank sum test)
PC12 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.3317 (Welch Two Sample t-test)
PC34 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.3693 (Welch Two Sample t-test)
PC10 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.4299 (Welch Two Sample t-test)
PC29 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.4608 (Welch Two Sample t-test)
PC19 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.5271 (Welch Two Sample t-test)
age Mean (std) 0.6 (0.2) 0.6 (0.2) 0.5748 (Two Sample t-test)
PC37 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.5893 (Welch Two Sample t-test)
PC35 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.5951 (Welch Two Sample t-test)
PC11 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.6079 (Welch Two Sample t-test)
PC18 Mean (std) 0.6 (0.3) 0.7 (0.1) 0.6378 (Wilcoxon rank sum test)
PC17 Mean (std) 0.7 (0.2) 0.7 (0.1) 0.6413 (Wilcoxon rank sum test)
PC38 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.6988 (Welch Two Sample t-test)
PC16 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.7181 (Welch Two Sample t-test)
PC32 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.7229 (Welch Two Sample t-test)
PC13 Mean (std) 0.5 (0.3) 0.5 (0.1) 0.7236 (Welch Two Sample t-test)
PC24 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.7318 (Welch Two Sample t-test)
PC2 Mean (std) 0.5 (0.3) 0.5 (0.1) 0.7329 (Welch Two Sample t-test)
PC30 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.7454 (Welch Two Sample t-test)
PC40 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.7592 (Welch Two Sample t-test)
Histology adenocarcinoma 7 (25.00%) 21 (75.00%) 0.7596 (Pearson’s Chi-squared test)
Histology large cell 11 (17.19%) 53 (82.81%) 0.7596 (Pearson’s Chi-squared test)
Histology nos 9 (25.00%) 27 (75.00%) 0.7596 (Pearson’s Chi-squared test)
Histology squamous cell carcinoma 22 (21.57%) 80 (78.43%) 0.7596 (Pearson’s Chi-squared test)
Overall.Stage I 6 (18.18%) 27 (81.82%) 0.8179 (Fisher’s Exact Test for Count Data)
Overall.Stage II 6 (28.57%) 15 (71.43%) 0.8179 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIa 15 (21.13%) 56 (78.87%) 0.8179 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIb 22 (20.95%) 83 (79.05%) 0.8179 (Fisher’s Exact Test for Count Data)
PC39 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.8179 (Welch Two Sample t-test)
PC9 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.8409 (Welch Two Sample t-test)
PC21 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.8448 (Welch Two Sample t-test)
Manufacturer CMS Inc.  11 (20.37%) 43 (79.63%) 0.8480 (Pearson’s Chi-squared test)
Manufacturer SIEMENS 38 (21.59%) 138 (78.41%) 0.8480 (Pearson’s Chi-squared test)
PC23 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.8597 (Welch Two Sample t-test)
PC33 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.8706 (Welch Two Sample t-test)
PC8 Mean (std) 0.5 (0.2) 0.5 (0.1) 0.8721 (Wilcoxon rank sum test)
PC28 Mean (std) 0.4 (0.2) 0.4 (0.1) 0.8722 (Welch Two Sample t-test)
PC20 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.9159 (Welch Two Sample t-test)
gender female 14 (20.90%) 53 (79.10%) 0.9227 (Pearson’s Chi-squared test)
gender male 35 (21.47%) 128 (78.53%) 0.9227 (Pearson’s Chi-squared test)
PC36 Mean (std) 0.6 (0.2) 0.6 (0.1) 0.9310 (Welch Two Sample t-test)
PC25 Mean (std) 0.5 (0.3) 0.5 (0.1) 0.9684 (Welch Two Sample t-test)
 table26 %>% save_kable("tables/table26.png")

Without additional variable selection

Data MRMR filtered Radiomics

I generate the model:

# I adjust VarSelCluster model without variables selection for this first
# approach and using BIC criteria to estimate the optimal number of clusters
set.seed(12345)
res_varsel <- VarSelCluster(x = lung_data_filtered_scaled_radiomics,
                          gvals = 2:10, 
                          nbcores = 4, 
                          vbleSelec = FALSE,  
                          crit.varsel = "BIC")

# I check the resulting model summary
summary(res_varsel)
## Model:
##    Number of components: 10 
##    Model selection has been performed according to the BIC  criterion
res_varsel@model@names.relevant
##  [1] "age"                                                    
##  [2] "original_shape_VoxelVolume"                             
##  [3] "wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis"       
##  [4] "wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis"        
##  [5] "log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis"
##  [6] "wavelet.HHL_glcm_ClusterProminence"                     
##  [7] "wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis"        
##  [8] "wavelet.HHH_firstorder_Kurtosis"                        
##  [9] "log.sigma.5.0.mm.3D_glcm_ClusterProminence"             
## [10] "wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis"        
## [11] "log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity"        
## [12] "wavelet.LLH_glcm_ClusterProminence"                     
## [13] "wavelet.HLL_glcm_Autocorrelation"                       
## [14] "log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis"
## [15] "wavelet.HHH_glszm_GrayLevelNonUniformity"               
## [16] "original_firstorder_Kurtosis"                           
## [17] "wavelet.HHH_firstorder_Median"                          
## [18] "log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis"
## [19] "wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis"  
## [20] "wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis"   
## [21] "wavelet.HHH_glrlm_GrayLevelVariance"                    
## [22] "original_gldm_LargeDependenceLowGrayLevelEmphasis"      
## [23] "wavelet.HLH_glcm_ClusterShade"                          
## [24] "wavelet.HHH_glcm_Imc2"                                  
## [25] "original_shape_Elongation"                              
## [26] "wavelet.LHH_glszm_SizeZoneNonUniformity"                
## [27] "log.sigma.2.0.mm.3D_glcm_ClusterShade"                  
## [28] "wavelet.HHH_firstorder_Skewness"                        
## [29] "log.sigma.2.0.mm.3D_firstorder_Kurtosis"                
## [30] "wavelet.LLH_glszm_SmallAreaEmphasis"                    
## [31] "wavelet.LLH_glcm_MCC"                                   
## [32] "wavelet.HHL_firstorder_Median"                          
## [33] "log.sigma.3.0.mm.3D_glcm_Autocorrelation"               
## [34] "wavelet.HLH_glszm_SmallAreaEmphasis"                    
## [35] "wavelet.HHL_gldm_DependenceNonUniformityNormalized"     
## [36] "wavelet.HHH_glszm_LowGrayLevelZoneEmphasis"             
## [37] "Overall.Stage"                                          
## [38] "gender"
res_varsel@model@names.irrelevant
## character(0)
# I check the discriminative power of the variables 
tiff(filename = "figures/Fig34.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel)
fig <- dev.off()
plot(res_varsel)

# I build q data frame including partitions, variables used in the model and other variables of interest
lung_data_res_varsel <- lung_data_filtered_scaled_radiomics %>% 
                              cbind(lung_data_no_out %>% select(Histology,
                                                                Survival.time,
                                                                deadstatus.event,
                                                                Manufacturer),
                                    partition = as.factor(res_varsel@partitions@zMAP))


# I generate a representation of the clusters using the main discriminative variables to define the axes
# (screencapture saved as figure 35)
plot_ly(lung_data_res_varsel, x=~original_shape_VoxelVolume, y=~wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis,
        z=~wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis, color=~partition) %>%
        add_markers(size=1.5)%>%
        layout(
          scene = list(
            xaxis = list(size = 0.5),
            yaxis = list(size = 0.5),
            zaxis = list(size = 0.5)))
# I display a summary of the probabilities of missclassification for each cluster
tiff(filename = "figures/Fig36.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel, type="probs-class")
fig <- dev.off()
plot(res_varsel, type="probs-class")

Then I want to evaluate possible relationship between clusters generated and histology class.

# I generate a cross table to check distribution of histology classes with partitions
table_hist_clust_res_varsel <- table(lung_data_res_varsel$Histology,lung_data_res_varsel$partition)
addmargins(table_hist_clust_res_varsel)
##                          
##                             1   2   3   4   5   6   7   8   9  10 Sum
##   adenocarcinoma            4   5   7   3   4   9   6   4   3   6  51
##   large cell               12   9  16   7  13  12  26   6   8   5 114
##   nos                       6   6  11   2   4   7  11   4   5   5  61
##   squamous cell carcinoma  17   6  17  11  13  26  21   6  22  12 151
##   Sum                      39  26  51  23  34  54  64  20  38  28 377
# I evaluate general association between clusters and histology class using chi 2 test
chisq.test(table_hist_clust_res_varsel) 
## 
##  Pearson's Chi-squared test
## 
## data:  table_hist_clust_res_varsel
## X-squared = 23.823, df = 27, p-value = 0.6401
# I display distribution of histology class within different clusters
ggplot(lung_data_res_varsel,aes(partition, fill = Histology)) + 
  geom_bar(position = 'fill',alpha = .5) +
  coord_flip()+
  theme_bw() 

ggsave("figures/Fig37.tiff",width = 100, height = 50, device="tiff", dpi=300, units = "mm", scale = 1)


# I continue with a correspondence analysis and generate an assymetric representation using 
# columns (clusters) repsented with the principal coordinates and histology classes
# represented with standard coordinates
ca.res_varsel <- ca(table_hist_clust_res_varsel)
tiff(filename = "figures/Fig38.tiff", width = 200, height = 200, units = "mm", res = 300)
plot(ca.res_varsel, map= "colprincipal") 
fig <- dev.off()
plot(ca.res_varsel, map= "colprincipal") 

# From CA analisis I get number of clusters with the greatest inertias to further compare cluster characteristics
clust_top_inertias <- data.frame(clust = ca.res_varsel$colnames, inertia = get_ca_col(ca.res_varsel)$inertia) %>% arrange(desc(inertia)) %>% top_n(3)

# After evaluating main clusters associated with different histology classes we can focus our analysis in 
# describing differences between these clusters:
lung_data_res_varsel_selected <- lung_data_res_varsel %>% filter(partition %in% clust_top_inertias$clust) %>% droplevels()
lung_data_res_varsel_selected_table <- table(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition)

# I evaluate agreement using adjusted rand index (ARI) coefficient
cat('ARI (Histology class, Partitions):',ARI(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition))
## ARI (Histology class, Partitions): 0.02951045
# I repeat a chi2 test just with this clusters
chisq.test(lung_data_res_varsel_selected_table) 
## 
##  Pearson's Chi-squared test
## 
## data:  lung_data_res_varsel_selected_table
## X-squared = 11.601, df = 6, p-value = 0.07149
# And I generate a cross table focusing on these clusters
crosstab5<- crosstable(lung_data_res_varsel_selected %>% select(.,-c(deadstatus.event,Survival.time)), 
           by = partition, test = TRUE)

 (table27 <- crosstab5 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 2 7 9 pval test
original_shape_VoxelVolume Mean (std) 0.6 (0.2) 0.5 (0.1) 0.6 (0.1) <0.0001 (One-way analysis of means (not assuming equal variances))
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 0.6 (0.1) 0.5 (0.1) 0.6 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.6 (0.1) 0.4 (0.1) 0.5 (0.1) <0.0001 (One-way analysis of means)
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.2 (0.1) 0.01 (0.01) 0.03 (0.02) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_glcm_ClusterProminence Mean (std) 0.01 (0.01) 0.02 (0.02) 0.009 (0.007) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.2 (0.1) 0.1 (0.1) 0.3 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_firstorder_Kurtosis Mean (std) 0.02 (0.01) 0.03 (0.01) 0.1 (0.03) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 0.1 (0.1) 0.3 (0.1) 0.3 (0.2) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.7 (0.1) 0.6 (0.1) 0.7 (0.1) <0.0001 (One-way analysis of means)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 0.03 (0.02) 0.04 (0.02) 0.1 (0.2) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 0.009 (0.004) 0.04 (0.03) 0.1 (0.04) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.04 (0.04) 0.02 (0.01) <0.0001 (Kruskal-Wallis rank sum test)
original_firstorder_Kurtosis Mean (std) 0.1 (0.1) 0.03 (0.02) 0.04 (0.03) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.2 (0.1) 0.03 (0.02) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.1 (0.04) 0.1 (0.02) <0.0001 (Kruskal-Wallis rank sum test)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 0.1 (0.03) 0.04 (0.02) 0.1 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.LLH_glcm_MCC Mean (std) 0.4 (0.2) 0.5 (0.1) 0.6 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 0.04 (0.03) 0.1 (0.02) 0.1 (0.04) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 0.7 (0.2) 0.6 (0.1) 0.5 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 0.2 (0.1) 0.2 (0.1) 0.3 (0.1) <0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 0.03 (0.03) 0.01 (0.01) 0.03 (0.02) 0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 0.02 (0.02) 0.03 (0.02) 0.01 (0.006) 0.0001 (Kruskal-Wallis rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 0.6 (0.2) 0.5 (0.2) 0.7 (0.2) 0.0002 (One-way analysis of means)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 0.1 (0.1) 0.04 (0.02) 0.04 (0.02) 0.0007 (Kruskal-Wallis rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 0.04 (0.02) 0.1 (0.04) 0.1 (0.04) 0.0010 (Kruskal-Wallis rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 0.5 (0.2) 0.4 (0.2) 0.3 (0.2) 0.0020 (Kruskal-Wallis rank sum test)
wavelet.HHL_firstorder_Median Mean (std) 0.4 (0.03) 0.4 (0.05) 0.4 (0.02) 0.0155 (One-way analysis of means (not assuming equal variances))
wavelet.HLH_glcm_ClusterShade Mean (std) 0.6 (0.005) 0.6 (0.01) 0.6 (0.01) 0.0554 (Kruskal-Wallis rank sum test)
Histology adenocarcinoma 5 (35.71%) 6 (42.86%) 3 (21.43%) 0.0761 (Fisher’s Exact Test for Count Data)
Histology large cell 9 (20.93%) 26 (60.47%) 8 (18.60%) 0.0761 (Fisher’s Exact Test for Count Data)
Histology nos 6 (27.27%) 11 (50.00%) 5 (22.73%) 0.0761 (Fisher’s Exact Test for Count Data)
Histology squamous cell carcinoma 6 (12.24%) 21 (42.86%) 22 (44.90%) 0.0761 (Fisher’s Exact Test for Count Data)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 0.5 (0.2) 0.5 (0.1) 0.5 (0.1) 0.1476 (One-way analysis of means (not assuming equal variances))
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 0.03 (0.008) 0.03 (0.01) 0.03 (0.001) 0.1642 (Kruskal-Wallis rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 0.1 (0.2) 0.1 (0.04) 0.1 (0.1) 0.1752 (Kruskal-Wallis rank sum test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 0.9 (0.002) 0.9 (0.008) 0.9 (0.01) 0.1992 (Kruskal-Wallis rank sum test)
original_shape_Elongation Mean (std) 0.8 (0.1) 0.7 (0.1) 0.7 (0.1) 0.2570 (One-way analysis of means)
age Mean (std) 0.6 (0.2) 0.6 (0.2) 0.6 (0.2) 0.3623 (One-way analysis of means)
wavelet.HHH_firstorder_Skewness Mean (std) 0.4 (0.04) 0.4 (0.1) 0.4 (0.1) 0.4862 (Kruskal-Wallis rank sum test)
gender female 4 (13.79%) 17 (58.62%) 8 (27.59%) 0.4972 (Pearson’s Chi-squared test)
gender male 22 (22.22%) 47 (47.47%) 30 (30.30%) 0.4972 (Pearson’s Chi-squared test)
Overall.Stage I 5 (27.78%) 10 (55.56%) 3 (16.67%) 0.6477 (Fisher’s Exact Test for Count Data)
Overall.Stage II 3 (20.00%) 8 (53.33%) 4 (26.67%) 0.6477 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIa 7 (20.59%) 19 (55.88%) 8 (23.53%) 0.6477 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIb 11 (18.03%) 27 (44.26%) 23 (37.70%) 0.6477 (Fisher’s Exact Test for Count Data)
Manufacturer CMS Inc.  5 (20.83%) 13 (54.17%) 6 (25.00%) 0.8343 (Fisher’s Exact Test for Count Data)
Manufacturer SIEMENS 21 (20.19%) 51 (49.04%) 32 (30.77%) 0.8343 (Fisher’s Exact Test for Count Data)
wavelet.HHH_firstorder_Median Mean (std) 0.6 (0.02) 0.6 (0.03) 0.6 (0.01) 0.8429 (One-way analysis of means (not assuming equal variances))
 table27 %>% save_kable("tables/table27.png")

Finally I want to analyze survival probabilities among different clusters.

# I generate a specific dataframe including survival, dead status event and partition variables
df_surv_res_varsel <- data.frame(Survival.time = lung_data_no_out$Survival.time, 
                                  deadstatus.event = lung_data_no_out$deadstatus.event,
                         partition = res_varsel@partitions@zMAP)

# I create a survival object, and use it as response variable 
# to create Kaplan-Meier survival curves for each cluster
sfit_res_varsel <- survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
                            data = df_surv_res_varsel)

# I plot survival curve including confidence intervals for each curve and 
# p value result from log-rank test to evaluate difference in survival between groups
tiff(filename = "figures/Fig39.tiff", width = 300, height = 200, units = "mm", res = 300)
(surv <- ggsurvplot(sfit_res_varsel, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))
fig <- dev.off()
(surv <- ggsurvplot(sfit_res_varsel, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))

min_max_surv <- surv_median(sfit_res_varsel) %>% 
  filter(median == min(median) | median == max(median)) %>% 
  select(strata) %>% 
  separate(strata,
            into = c('string', 
                    'partition_num'),
            sep = "=",
            convert = TRUE) 
min_max_surv$partition_num
## [1] 1 6
# I repeat log-rank test comparing only the two most extreme median survival groups 
surv_pvalue(
  survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
          data = df_surv_res_varsel %>% filter(partition %in% min_max_surv$partition_num)),
  method = "survdiff",
  test.for.trend = FALSE,
  combine = FALSE
)
##    variable        pval   method   pval.txt
## 1 partition 0.007528379 Log-rank p = 0.0075
# And describe composition for the clusters with most extreme results on survival analisis
crosstab6<- crosstable(lung_data_res_varsel %>% filter(partition %in% min_max_surv$partition_num) %>% select(.,-c(deadstatus.event,Survival.time)) %>% droplevels(), 
           by = partition, test = TRUE) 

 (table28 <- crosstab6 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 1 6 pval test
original_shape_VoxelVolume Mean (std) 0.7 (0.1) 0.2 (0.1) <0.0001 (Two Sample t-test)
wavelet.HLL_glszm_LargeAreaHighGrayLevelEmphasis Mean (std) 0.7 (0.1) 0.4 (0.1) <0.0001 (Two Sample t-test)
wavelet.HLH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.5 (0.1) 0.3 (0.1) <0.0001 (Two Sample t-test)
log.sigma.2.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.002 (0.002) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_glcm_ClusterProminence Mean (std) 0.04 (0.02) 0.01 (0.01) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.6 (0.1) 0.4 (0.1) <0.0001 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glszm_SizeZoneNonUniformity Mean (std) 0.1 (0.1) 0.03 (0.03) <0.0001 (Wilcoxon rank sum test)
wavelet.HLL_glcm_Autocorrelation Mean (std) 0.1 (0.05) 0.04 (0.03) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_GrayLevelNonUniformity Mean (std) 0.2 (0.2) 0.01 (0.009) <0.0001 (Wilcoxon rank sum test)
original_firstorder_Kurtosis Mean (std) 0.05 (0.03) 0.02 (0.02) <0.0001 (Wilcoxon rank sum test)
log.sigma.1.0.mm.3D_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.1 (0.1) 0.005 (0.004) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceHighGrayLevelEmphasis Mean (std) 0.1 (0.03) 0.01 (0.009) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_SmallDependenceLowGrayLevelEmphasis Mean (std) 0.05 (0.03) 0.2 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glrlm_GrayLevelVariance Mean (std) 0.1 (0.1) 0.02 (4e-04) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glcm_Imc2 Mean (std) 0.5 (0.1) 0.3 (0.2) <0.0001 (Wilcoxon rank sum test)
wavelet.LHH_glszm_SizeZoneNonUniformity Mean (std) 0.2 (0.2) 0.02 (0.01) <0.0001 (Wilcoxon rank sum test)
log.sigma.2.0.mm.3D_firstorder_Kurtosis Mean (std) 0.1 (0.03) 0.02 (0.01) <0.0001 (Wilcoxon rank sum test)
log.sigma.3.0.mm.3D_glcm_Autocorrelation Mean (std) 0.1 (0.05) 0.04 (0.02) <0.0001 (Wilcoxon rank sum test)
wavelet.HHL_gldm_DependenceNonUniformityNormalized Mean (std) 0.2 (0.1) 0.1 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_glszm_LowGrayLevelZoneEmphasis Mean (std) 0.4 (0.1) 0.7 (0.1) <0.0001 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Kurtosis Mean (std) 0.1 (0.03) 0.03 (0.02) 0.0001 (Wilcoxon rank sum test)
wavelet.HHL_glszm_LargeAreaLowGrayLevelEmphasis Mean (std) 0.2 (0.1) 0.1 (0.1) 0.0004 (Wilcoxon rank sum test)
wavelet.LLH_glszm_SmallAreaEmphasis Mean (std) 0.5 (0.1) 0.5 (0.1) 0.0004 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glcm_ClusterProminence Mean (std) 0.3 (0.1) 0.4 (0.1) 0.0007 (Wilcoxon rank sum test)
Manufacturer CMS Inc.  14 (70.00%) 6 (30.00%) 0.0041 (Pearson’s Chi-squared test)
Manufacturer SIEMENS 25 (34.25%) 48 (65.75%) 0.0041 (Pearson’s Chi-squared test)
wavelet.LLH_glcm_ClusterProminence Mean (std) 0.04 (0.02) 0.1 (0.04) 0.0398 (Wilcoxon rank sum test)
wavelet.HHL_firstorder_Median Mean (std) 0.4 (0.03) 0.4 (0.1) 0.0486 (Welch Two Sample t-test)
age Mean (std) 0.6 (0.2) 0.6 (0.2) 0.0847 (Two Sample t-test)
wavelet.LLH_glcm_MCC Mean (std) 0.5 (0.1) 0.6 (0.1) 0.1911 (Two Sample t-test)
wavelet.HLH_glcm_ClusterShade Mean (std) 0.6 (0.01) 0.6 (0.04) 0.2826 (Wilcoxon rank sum test)
gender female 11 (34.38%) 21 (65.62%) 0.2845 (Pearson’s Chi-squared test)
gender male 28 (45.90%) 33 (54.10%) 0.2845 (Pearson’s Chi-squared test)
wavelet.HHH_firstorder_Skewness Mean (std) 0.4 (0.1) 0.4 (0.1) 0.3304 (Wilcoxon rank sum test)
Overall.Stage I 6 (31.58%) 13 (68.42%) 0.4106 (Pearson’s Chi-squared test)
Overall.Stage II 6 (50.00%) 6 (50.00%) 0.4106 (Pearson’s Chi-squared test)
Overall.Stage IIIa 11 (35.48%) 20 (64.52%) 0.4106 (Pearson’s Chi-squared test)
Overall.Stage IIIb 16 (51.61%) 15 (48.39%) 0.4106 (Pearson’s Chi-squared test)
wavelet.HLH_glszm_SmallAreaEmphasis Mean (std) 0.6 (0.1) 0.5 (0.2) 0.4455 (Wilcoxon rank sum test)
log.sigma.5.0.mm.3D_glszm_SmallAreaLowGrayLevelEmphasis Mean (std) 0.03 (0.01) 0.03 (0.02) 0.5283 (Wilcoxon rank sum test)
original_gldm_LargeDependenceLowGrayLevelEmphasis Mean (std) 0.04 (0.02) 0.05 (0.03) 0.5804 (Wilcoxon rank sum test)
Histology adenocarcinoma 4 (30.77%) 9 (69.23%) 0.6815 (Pearson’s Chi-squared test)
Histology large cell 12 (50.00%) 12 (50.00%) 0.6815 (Pearson’s Chi-squared test)
Histology nos 6 (46.15%) 7 (53.85%) 0.6815 (Pearson’s Chi-squared test)
Histology squamous cell carcinoma 17 (39.53%) 26 (60.47%) 0.6815 (Pearson’s Chi-squared test)
log.sigma.2.0.mm.3D_glcm_ClusterShade Mean (std) 0.9 (0.006) 0.9 (0.008) 0.7378 (Wilcoxon rank sum test)
original_shape_Elongation Mean (std) 0.7 (0.1) 0.7 (0.2) 0.9814 (Wilcoxon rank sum test)
wavelet.HHH_firstorder_Median Mean (std) 0.6 (0.02) 0.6 (0.05) 0.9914 (Welch Two Sample t-test)
 table28 %>% save_kable("tables/table28.png")
Data PCA Radiomics
set.seed(12345)

# I adjust VarSelCluster model without variables selection for this first
# approach and using BIC criteria to estimate the number of partitions
res_varsel_pca <- VarSelCluster(x = lung_data_pca_scaled_radiomics,
                          gvals = 2:10, 
                          nbcores = 4, 
                          vbleSelec = FALSE,  
                          crit.varsel = "BIC")

# I check the resulting model summary
summary(res_varsel_pca)
## Model:
##    Number of components: 3 
##    Model selection has been performed according to the BIC  criterion
res_varsel_pca@model
## An object of class "VSLCMmodel"
## Slot "g":
## [1] 3
## 
## Slot "omega":
##           age           PC1           PC2           PC3           PC4 
##             1             1             1             1             1 
##           PC5           PC6           PC7           PC8           PC9 
##             1             1             1             1             1 
##          PC10          PC11          PC12          PC13          PC14 
##             1             1             1             1             1 
##          PC15          PC16          PC17          PC18          PC19 
##             1             1             1             1             1 
##          PC20          PC21          PC22          PC23          PC24 
##             1             1             1             1             1 
##          PC25          PC26          PC27          PC28          PC29 
##             1             1             1             1             1 
##          PC30          PC31          PC32          PC33          PC34 
##             1             1             1             1             1 
##          PC35          PC36          PC37          PC38          PC39 
##             1             1             1             1             1 
##          PC40 Overall.Stage        gender 
##             1             1             1 
## 
## Slot "names.relevant":
##  [1] "age"           "PC1"           "PC2"           "PC3"          
##  [5] "PC4"           "PC5"           "PC6"           "PC7"          
##  [9] "PC8"           "PC9"           "PC10"          "PC11"         
## [13] "PC12"          "PC13"          "PC14"          "PC15"         
## [17] "PC16"          "PC17"          "PC18"          "PC19"         
## [21] "PC20"          "PC21"          "PC22"          "PC23"         
## [25] "PC24"          "PC25"          "PC26"          "PC27"         
## [29] "PC28"          "PC29"          "PC30"          "PC31"         
## [33] "PC32"          "PC33"          "PC34"          "PC35"         
## [37] "PC36"          "PC37"          "PC38"          "PC39"         
## [41] "PC40"          "Overall.Stage" "gender"       
## 
## Slot "names.irrelevant":
## character(0)
# I check the discriminative power of the variables 
tiff(filename = "figures/Fig40.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel_pca)
fig <- dev.off()
plot(res_varsel_pca)

# I build q data frame including partitions, variables used in the model and other variables of interest
lung_data_res_varsel_pca <- lung_data_pca_scaled_radiomics %>% 
                              cbind(lung_data_no_out %>% select(Histology,
                                                                Survival.time,
                                                                deadstatus.event,
                                                                Manufacturer),
                                    partition = as.factor(res_varsel_pca@partitions@zMAP))


# I generate a representation of the clusters using the main discriminative variables to define the axes
# (screencapture saved as figure 41)
plot_ly(lung_data_res_varsel_pca, x=~PC1, y=~PC2,
        z=~PC3, color=~partition) %>%
        add_markers(size=1.5)%>%
        layout(
          scene = list(
            xaxis = list(size = 0.5),
            yaxis = list(size = 0.5),
            zaxis = list(size = 0.5)))
# I display a summary of the probabilities of missclassification for each cluster
tiff(filename = "figures/Fig42.tiff", width = 400, height = 150, units = "mm", res = 300)
plot(res_varsel_pca, type="probs-class")
fig <- dev.off()
plot(res_varsel_pca, type="probs-class")

Then I want to evaluate possible relationship between clusters generated and histology class.

# I generate a cross table to check distribution of histology classes with partitions
table_hist_clust_res_varsel_pca <- table(lung_data_res_varsel_pca$Histology,lung_data_res_varsel_pca$partition)
addmargins(table_hist_clust_res_varsel_pca)
##                          
##                             1   2   3 Sum
##   adenocarcinoma           21   7  23  51
##   large cell               51  11  52 114
##   nos                      27   9  25  61
##   squamous cell carcinoma  79  22  50 151
##   Sum                     178  49 150 377
# I evaluate general association between clusters and histology class using chi 2 test
chisq.test(table_hist_clust_res_varsel_pca) 
## 
##  Pearson's Chi-squared test
## 
## data:  table_hist_clust_res_varsel_pca
## X-squared = 5.9707, df = 6, p-value = 0.4265
# I display distribution of histology class within different clusters
ggplot(lung_data_res_varsel_pca,aes(partition, fill = Histology)) + 
  geom_bar(position = 'fill',alpha = .5) +
  coord_flip()+
  theme_bw() 

ggsave("figures/Fig43.tiff",width = 100, height = 50, device="tiff", dpi=300, units = "mm", scale = 1)


# I continue with a correspondence analysis and generate an assymetric representation using 
# columns (clusters) repsented with the principal coordinates and histology classes
# represented with standard coordinates
ca.res_varsel_pca <- ca(table_hist_clust_res_varsel_pca)
tiff(filename = "figures/Fig44.tiff", width = 200, height = 200, units = "mm", res = 300)
plot(ca.res_varsel_pca, map= "colprincipal") 
fig <- dev.off()
plot(ca.res_varsel_pca, map= "colprincipal") 

# From CA analisis I get number of clusters with the greatest inertias to further compare cluster characteristics
clust_top_inertias <- data.frame(clust = ca.res_varsel_pca$colnames, inertia = get_ca_col(ca.res_varsel_pca)$inertia) %>% arrange(desc(inertia)) %>% top_n(3)

# After evaluating main clusters associated with different histology classes we can focus our analysis in 
# describing differences between these clusters:
lung_data_res_varsel_selected <- lung_data_res_varsel_pca %>% filter(partition %in% clust_top_inertias$clust) %>% droplevels()
lung_data_res_varsel_selected_table <- table(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition)


# I evaluate agreement using adjusted rand index (ARI) coefficient
cat('ARI (Histology class, Partitions):',ARI(lung_data_res_varsel_selected$Histology,lung_data_res_varsel_selected$partition))
## ARI (Histology class, Partitions): 0.005437933
# I repeat a chi2 test just with this clusters
chisq.test(lung_data_res_varsel_selected_table) 
## 
##  Pearson's Chi-squared test
## 
## data:  lung_data_res_varsel_selected_table
## X-squared = 5.9707, df = 6, p-value = 0.4265
# And I generate a cross table focusing on these clusters
crosstab7<- crosstable(lung_data_res_varsel_selected %>% select(.,-c(deadstatus.event,Survival.time)), 
           by = partition, test = TRUE)
## Error in fisher.test(x, y): FEXACT error 6.  LDKEY=620 is too small for this problem,
##   (ii := key2[itp=289] = 1168357, ldstp=18600)
## Try increasing the size of the workspace and possibly 'mult'
 (table29 <- crosstab7 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
## Error in separate(., test, into = c("pvalue", "test"), sep = "\n"): object 'crosstab7' not found
 table29 %>% save_kable("tables/table29.png")
## Error in save_kable(., "tables/table29.png"): object 'table29' not found

Finally I want to analyze survival probabilities among different clusters.

# I generate a specific dataframe including survival, dead status event and partition variables
df_surv_res_varsel_pca <- data.frame(Survival.time = lung_data_no_out$Survival.time, 
                                  deadstatus.event = lung_data_no_out$deadstatus.event,
                         partition = res_varsel_pca@partitions@zMAP)

# I create a survival object, and use it as response variable 
# to create Kaplan-Meier survival curves for each cluster
sfit_res_varsel_pca <- survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
                            data = df_surv_res_varsel_pca)

# I plot survival curve including confidence intervals for each curve and 
# p value result from log-rank test to evaluate difference in survival between groups
tiff(filename = "figures/Fig45.tiff", width = 300, height = 200, units = "mm", res = 300)
(surv <- ggsurvplot(sfit_res_varsel_pca, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))
fig <- dev.off()
(surv <- ggsurvplot(sfit_res_varsel_pca, 
           surv.median.line = 'hv',
           pval=TRUE, 
           risk.table='percentage',
           surv.plot.height = 0.7,
           tables.height = 0.3))

min_max_surv <- surv_median(sfit_res_varsel_pca) %>% 
  filter(median == min(median) | median == max(median)) %>% 
  select(strata) %>% 
  separate(strata,
            into = c('string', 
                    'partition_num'),
            sep = "=",
            convert = TRUE) 
min_max_surv$partition_num
## [1] 1 2
# I repeat log-rank test comparing only the two most extreme median survival groups 
surv_pvalue(
  survfit(Surv(Survival.time, as.numeric(deadstatus.event)) ~ partition, 
          data = df_surv_res_varsel_pca %>% filter(partition %in% min_max_surv$partition_num)),
  method = "survdiff",
  test.for.trend = FALSE,
  combine = FALSE
)
##    variable       pval   method  pval.txt
## 1 partition 0.01364493 Log-rank p = 0.014
# And describe composition for the clusters with most extreme results on survival analisis
crosstab8<- crosstable(lung_data_res_varsel_pca %>% filter(partition %in% min_max_surv$partition_num) %>% select(.,-c(deadstatus.event,Survival.time)) %>% droplevels(), 
           by = partition, test = TRUE)

 (table30 <- crosstab8 %>% separate(test,
            into = c('pvalue', 
                    'test'),
            sep = "\n") %>% 
            separate(pvalue,
            into = c('string', 
                    'pval'),
            sep = ":",
            convert = TRUE) %>% 
            arrange(pval) %>% 
            filter(variable != 'Med [IQR]',
                   variable != 'N (NA)',
                   variable != 'Min / Max') %>% 
            dplyr::select(.,-c(string,.id)) %>% kable(full_with = FALSE) %>% kable_classic_2())
label variable 1 2 pval test
PC3 Mean (std) 0.2 (0.1) 0.5 (0.2) <0.0001 (Welch Two Sample t-test)
PC4 Mean (std) 0.7 (0.1) 0.5 (0.2) <0.0001 (Welch Two Sample t-test)
PC6 Mean (std) 0.2 (0.1) 0.4 (0.2) <0.0001 (Wilcoxon rank sum test)
PC1 Mean (std) 0.4 (0.1) 0.6 (0.3) 0.0040 (Wilcoxon rank sum test)
PC27 Mean (std) 0.5 (0.1) 0.6 (0.2) 0.0892 (Wilcoxon rank sum test)
PC22 Mean (std) 0.5 (0.1) 0.6 (0.2) 0.1278 (Wilcoxon rank sum test)
PC7 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.1445 (Welch Two Sample t-test)
PC5 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.1697 (Wilcoxon rank sum test)
PC31 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.1955 (Wilcoxon rank sum test)
PC15 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.2660 (Welch Two Sample t-test)
PC26 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.3034 (Wilcoxon rank sum test)
PC14 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.3288 (Welch Two Sample t-test)
PC12 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.3356 (Welch Two Sample t-test)
PC34 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.3933 (Welch Two Sample t-test)
PC10 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.4330 (Welch Two Sample t-test)
PC29 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.4655 (Welch Two Sample t-test)
PC19 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.5281 (Welch Two Sample t-test)
age Mean (std) 0.6 (0.2) 0.6 (0.2) 0.5588 (Two Sample t-test)
PC35 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.5650 (Welch Two Sample t-test)
PC17 Mean (std) 0.7 (0.1) 0.7 (0.2) 0.6077 (Wilcoxon rank sum test)
PC18 Mean (std) 0.7 (0.1) 0.6 (0.3) 0.6146 (Wilcoxon rank sum test)
PC37 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.6149 (Welch Two Sample t-test)
PC11 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.6217 (Welch Two Sample t-test)
PC38 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.6575 (Welch Two Sample t-test)
PC8 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.7176 (Welch Two Sample t-test)
PC16 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.7189 (Welch Two Sample t-test)
PC32 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.7220 (Welch Two Sample t-test)
PC24 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.7250 (Welch Two Sample t-test)
PC13 Mean (std) 0.5 (0.1) 0.5 (0.3) 0.7351 (Welch Two Sample t-test)
PC2 Mean (std) 0.5 (0.1) 0.5 (0.3) 0.7375 (Welch Two Sample t-test)
PC30 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.7650 (Welch Two Sample t-test)
Overall.Stage I 26 (81.25%) 6 (18.75%) 0.7866 (Fisher’s Exact Test for Count Data)
Overall.Stage II 14 (70.00%) 6 (30.00%) 0.7866 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIa 55 (78.57%) 15 (21.43%) 0.7866 (Fisher’s Exact Test for Count Data)
Overall.Stage IIIb 83 (79.05%) 22 (20.95%) 0.7866 (Fisher’s Exact Test for Count Data)
PC40 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.8017 (Welch Two Sample t-test)
Manufacturer CMS Inc.  43 (79.63%) 11 (20.37%) 0.8036 (Pearson’s Chi-squared test)
Manufacturer SIEMENS 135 (78.03%) 38 (21.97%) 0.8036 (Pearson’s Chi-squared test)
Histology adenocarcinoma 21 (75.00%) 7 (25.00%) 0.8051 (Pearson’s Chi-squared test)
Histology large cell 51 (82.26%) 11 (17.74%) 0.8051 (Pearson’s Chi-squared test)
Histology nos 27 (75.00%) 9 (25.00%) 0.8051 (Pearson’s Chi-squared test)
Histology squamous cell carcinoma 79 (78.22%) 22 (21.78%) 0.8051 (Pearson’s Chi-squared test)
PC39 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.8125 (Welch Two Sample t-test)
PC23 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.8312 (Welch Two Sample t-test)
PC21 Mean (std) 0.5 (0.1) 0.5 (0.2) 0.8382 (Welch Two Sample t-test)
PC33 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.8443 (Welch Two Sample t-test)
PC9 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.8695 (Welch Two Sample t-test)
PC25 Mean (std) 0.5 (0.1) 0.5 (0.3) 0.8988 (Welch Two Sample t-test)
PC28 Mean (std) 0.4 (0.1) 0.4 (0.2) 0.9035 (Welch Two Sample t-test)
PC36 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.9358 (Welch Two Sample t-test)
PC20 Mean (std) 0.6 (0.1) 0.6 (0.2) 0.9372 (Welch Two Sample t-test)
gender female 51 (78.46%) 14 (21.54%) 0.9912 (Pearson’s Chi-squared test)
gender male 127 (78.40%) 35 (21.60%) 0.9912 (Pearson’s Chi-squared test)
 table30 %>% save_kable("tables/table30.png")